1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "classfile/symbolTable.hpp"
31 #include "code/aotCodeCache.hpp"
32 #include "code/compiledIC.hpp"
33 #include "code/debugInfoRec.hpp"
34 #include "code/nativeInst.hpp"
35 #include "code/vtableStubs.hpp"
36 #include "compiler/oopMap.hpp"
37 #include "gc/shared/collectedHeap.hpp"
38 #include "gc/shared/gcLocker.hpp"
39 #include "gc/shared/barrierSet.hpp"
40 #include "gc/shared/barrierSetAssembler.hpp"
41 #include "interpreter/interpreter.hpp"
42 #include "logging/log.hpp"
43 #include "memory/resourceArea.hpp"
44 #include "memory/universe.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "oops/method.inline.hpp"
47 #include "prims/methodHandles.hpp"
48 #include "runtime/continuation.hpp"
49 #include "runtime/continuationEntry.inline.hpp"
50 #include "runtime/globals.hpp"
51 #include "runtime/jniHandles.hpp"
52 #include "runtime/safepointMechanism.hpp"
53 #include "runtime/sharedRuntime.hpp"
54 #include "runtime/signature.hpp"
55 #include "runtime/stubRoutines.hpp"
56 #include "runtime/timerTrace.hpp"
57 #include "runtime/vframeArray.hpp"
58 #include "runtime/vm_version.hpp"
59 #include "utilities/align.hpp"
60 #include "utilities/checkedCast.hpp"
61 #include "utilities/formatBuffer.hpp"
62 #include "vmreg_x86.inline.hpp"
63 #ifdef COMPILER1
64 #include "c1/c1_Runtime1.hpp"
65 #endif
66 #ifdef COMPILER2
67 #include "opto/runtime.hpp"
68 #endif
69 #if INCLUDE_JVMCI
70 #include "jvmci/jvmciJavaClasses.hpp"
71 #endif
72
73 #define __ masm->
74
75 #ifdef PRODUCT
76 #define BLOCK_COMMENT(str) /* nothing */
77 #else
78 #define BLOCK_COMMENT(str) __ block_comment(str)
79 #endif // PRODUCT
80
// Required stack alignment expressed in VMReg stack slots (slot size is
// VMRegImpl::stack_slot_size bytes) rather than in bytes.
const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
82
// Describes the stack frame built by save_live_registers() and provides
// byte offsets into it for deoptimization code that needs to read or
// rewrite individual saved register values.
class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
  // Byte offsets of the register banks inside the FXSAVE/XSAVE save area.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Helpers that define the <reg>_off / <reg>H_off enum pairs (low/high
  // 32-bit halves) for each vector/opmask register at its area offset.
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // APX extended general-purpose registers r16..r31
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy general-purpose registers, pushed above the FPU state.
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
182
// Emits code that saves all live registers (GPRs, flags, FPU/XMM state,
// and optionally the wide YMM/ZMM and opmask state) into a new frame,
// and builds an OopMap describing where each register was saved so that
// GC and deoptimization can find oops and debug-info values.
// On return, *total_frame_words holds the frame size in words.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    // Save APX extended general-purpose registers r16..r31
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved by is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
427
// Emits code that pops the frame built by save_live_registers(),
// restoring all saved registers in the reverse order they were saved.
// restore_wide_vectors must match the save_wide_vectors value used when
// the frame was built.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    // Restore APX extended general-purpose registers r16..r31
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
511
// Emits code that restores only the Java result registers (rax/rdx and
// xmm0) from the save area and then pops the frame, leaving only the
// return address on the stack. Used exclusively by deoptimization.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
529
530 // Is vector's size (in bytes) bigger than a size saved by default?
531 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
532 bool SharedRuntime::is_wide_vector(int size) {
533 return size > 16;
534 }
535
536 // ---------------------------------------------------------------------------
537 // Read the array of BasicTypes from a signature, and compute where the
538 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
539 // quantities. Values less than VMRegImpl::stack0 are registers, those above
540 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
541 // as framesizes are fixed.
542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.
544 // Register up to Register::number_of_registers are the 64-bit
545 // integer registers.
546
547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
548 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
549 // units regardless of build. Of course for i486 there is no 64 bit build
550
551 // The Java calling convention is a "shifted" version of the C ABI.
552 // By skipping the first C ABI register we can call non-static jni methods
553 // with small numbers of arguments without having to shuffle the arguments
554 // at all. Since we control the java ABI we ought to at least get some
555 // advantage out of it.
556
557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
558 VMRegPair *regs,
559 int total_args_passed) {
560
561 // Create the mapping between argument positions and
562 // registers.
563 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
564 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
565 };
566 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
567 j_farg0, j_farg1, j_farg2, j_farg3,
568 j_farg4, j_farg5, j_farg6, j_farg7
569 };
570
571
572 uint int_args = 0;
573 uint fp_args = 0;
574 uint stk_args = 0;
575
576 for (int i = 0; i < total_args_passed; i++) {
577 switch (sig_bt[i]) {
578 case T_BOOLEAN:
579 case T_CHAR:
580 case T_BYTE:
581 case T_SHORT:
582 case T_INT:
583 if (int_args < Argument::n_int_register_parameters_j) {
584 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
585 } else {
586 stk_args = align_up(stk_args, 2);
587 regs[i].set1(VMRegImpl::stack2reg(stk_args));
588 stk_args += 1;
589 }
590 break;
591 case T_VOID:
592 // halves of T_LONG or T_DOUBLE
593 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
594 regs[i].set_bad();
595 break;
596 case T_LONG:
597 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
598 // fall through
599 case T_OBJECT:
600 case T_ARRAY:
601 case T_ADDRESS:
602 if (int_args < Argument::n_int_register_parameters_j) {
603 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
604 } else {
605 stk_args = align_up(stk_args, 2);
606 regs[i].set2(VMRegImpl::stack2reg(stk_args));
607 stk_args += 2;
608 }
609 break;
610 case T_FLOAT:
611 if (fp_args < Argument::n_float_register_parameters_j) {
612 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
613 } else {
614 stk_args = align_up(stk_args, 2);
615 regs[i].set1(VMRegImpl::stack2reg(stk_args));
616 stk_args += 1;
617 }
618 break;
619 case T_DOUBLE:
620 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
621 if (fp_args < Argument::n_float_register_parameters_j) {
622 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
623 } else {
624 stk_args = align_up(stk_args, 2);
625 regs[i].set2(VMRegImpl::stack2reg(stk_args));
626 stk_args += 2;
627 }
628 break;
629 default:
630 ShouldNotReachHere();
631 break;
632 }
633 }
634
635 return stk_args;
636 }
637
638 // Same as java_calling_convention() but for multiple return
639 // values. There's no way to store them on the stack so if we don't
640 // have enough registers, multiple values can't be returned.
641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
644 VMRegPair *regs,
645 int total_args_passed) {
646 // Create the mapping between argument positions and
647 // registers.
648 static const Register INT_ArgReg[java_return_convention_max_int] = {
649 rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
650 };
651 static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
652 j_farg0, j_farg1, j_farg2, j_farg3,
653 j_farg4, j_farg5, j_farg6, j_farg7
654 };
655
656
657 uint int_args = 0;
658 uint fp_args = 0;
659
660 for (int i = 0; i < total_args_passed; i++) {
661 switch (sig_bt[i]) {
662 case T_BOOLEAN:
663 case T_CHAR:
664 case T_BYTE:
665 case T_SHORT:
666 case T_INT:
667 if (int_args < Argument::n_int_register_parameters_j+1) {
668 regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
669 int_args++;
670 } else {
671 return -1;
672 }
673 break;
674 case T_VOID:
675 // halves of T_LONG or T_DOUBLE
676 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
677 regs[i].set_bad();
678 break;
679 case T_LONG:
680 assert(sig_bt[i + 1] == T_VOID, "expecting half");
681 // fall through
682 case T_OBJECT:
683 case T_ARRAY:
684 case T_ADDRESS:
685 case T_METADATA:
686 if (int_args < Argument::n_int_register_parameters_j+1) {
687 regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
688 int_args++;
689 } else {
690 return -1;
691 }
692 break;
693 case T_FLOAT:
694 if (fp_args < Argument::n_float_register_parameters_j) {
695 regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
696 fp_args++;
697 } else {
698 return -1;
699 }
700 break;
701 case T_DOUBLE:
702 assert(sig_bt[i + 1] == T_VOID, "expecting half");
703 if (fp_args < Argument::n_float_register_parameters_j) {
704 regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
705 fp_args++;
706 } else {
707 return -1;
708 }
709 break;
710 default:
711 ShouldNotReachHere();
712 break;
713 }
714 }
715
716 return int_args + fp_args;
717 }
718
// Patch the callers callsite with entry to compiled code if it exists.
// Emits code that, if rbx (the Method*) has compiled code installed,
// calls into the VM (SharedRuntime::fixup_callers_callsite) to patch
// the caller's call site, preserving all CPU state across the call.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // Skip the whole sequence when no compiled code is installed.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  // Arguments: c_rarg0 = target method, c_rarg1 = caller's return address.
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
760
761 // For each inline type argument, sig includes the list of fields of
762 // the inline type. This utility function computes the number of
763 // arguments for the call if inline types are passed by reference (the
764 // calling convention the interpreter expects).
765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
766 int total_args_passed = 0;
767 if (InlineTypePassFieldsAsArgs) {
768 for (int i = 0; i < sig_extended->length(); i++) {
769 BasicType bt = sig_extended->at(i)._bt;
770 if (bt == T_METADATA) {
771 // In sig_extended, an inline type argument starts with:
772 // T_METADATA, followed by the types of the fields of the
773 // inline type and T_VOID to mark the end of the value
774 // type. Inline types are flattened so, for instance, in the
775 // case of an inline type with an int field and an inline type
776 // field that itself has 2 fields, an int and a long:
777 // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
778 // slot for the T_LONG) T_VOID (inner inline type) T_VOID
779 // (outer inline type)
780 total_args_passed++;
781 int vt = 1;
782 do {
783 i++;
784 BasicType bt = sig_extended->at(i)._bt;
785 BasicType prev_bt = sig_extended->at(i-1)._bt;
786 if (bt == T_METADATA) {
787 vt++;
788 } else if (bt == T_VOID &&
789 prev_bt != T_LONG &&
790 prev_bt != T_DOUBLE) {
791 vt--;
792 }
793 } while (vt != 0);
794 } else {
795 total_args_passed++;
796 }
797 }
798 } else {
799 total_args_passed = sig_extended->length();
800 }
801 return total_args_passed;
802 }
803
804
// Emits code that moves one compiled-convention argument (described by
// reg_pair, which may be a register or a compiled-frame stack slot at
// offset extraspace) into the interpreter's expected stack slot `to`.
// For oop values the store goes through the GC barrier
// (store_heap_oop); T_VOID entries (long/double halves) emit nothing.
static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However to make thing extra confusing. Because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  if (!r_1->is_valid()) {
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    // Integer or oop value: may be in a GPR or on the compiled caller's stack.
    Register val = rax;
    if (r_1->is_stack()) {
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      // Preserve r13/rbx, which are used as temps by store_heap_oop.
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    // Floating-point value in an XMM register: double if wide, float otherwise.
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}
869
// Generates the compiled-to-interpreter (c2i) adapter: unpacks arguments from
// the compiled calling convention (registers and outgoing stack slots, with
// inline-type fields scalarized when InlineTypePassFieldsAsArgs) into the
// interpreter's stack-element layout, then jumps to the callee's interpreter
// entry. On entry rbx holds the callee Method*.
// Outputs:
//   c2i_no_clinit_check_entry — entry point that bypasses the class-init
//                               barrier (only set when the barrier is emitted)
//   frame_complete / frame_size_in_words / oop_maps — describe the temporary
//       frame built around the inline-type buffer allocation runtime call
static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least an inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      // Call allocate_inline_types(current thread, callee Method* in rbx,
      // whether the receiver itself needs a buffer as well).
      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      // If the allocation raised an exception, clear the oop result and
      // forward the pending exception to the caller.
      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
      __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      // Regular (non-scalarized) argument: copy straight to its
      // interpreter slot. Longs/doubles occupy the lower of their two slots.
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      ignored++;
      // get the buffer from the just allocated pool of buffers
      int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
      __ load_heap_oop(r14, Address(rscratch2, index));
      next_vt_arg++; next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          vt--;
          ignored++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            // Argument is null: pass null to the interpreter and skip the
            // remaining field stores for this inline type.
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ movptr(Address(rsp, st_off), r14);
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
1087
// Generates the interpreter-to-compiled (i2c) adapter: picks up arguments
// laid out in interpreter stack-element format (addressed via the incoming
// rsp) and shuffles them into the compiled calling convention described by
// 'regs' (registers and comp_args_on_stack outgoing stack slots), then jumps
// to the method's compiled entry. On entry rbx holds the callee Method* and
// r13 the senderSP.
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  int total_args_passed = sig->length();

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    // ld_off/next_off are the offsets (from saved_sp) of the two
    // interpreter slots belonging to this argument.
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      // Argument has no location in the compiled convention; skip it.
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM argument: single- or double-precision depending on slot count.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1265
// Emits the inline-cache check used at the unverified c2i entry. On entry
// rax holds the CompiledICData. Loads the speculated Method* into rbx; if
// that method has been compiled since the call site was patched to
// interpreted, branch to the ic-miss stub so the call site gets corrected,
// otherwise jump to skip_fixup and proceed into the c2i adapter.
static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
  Register data = rax;
  __ ic_check(1 /* end_alignment */);
  __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));

  // Method might have been compiled since the call site was patched to
  // interpreted if that is the case treat it as a miss so we can get
  // the call site corrected.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, skip_fixup);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
}
1278
1279 // ---------------------------------------------------------------
// Generates the full set of i2c/c2i adapter entry points for one adapter
// fingerprint and records their addresses in entry_address. Three signature
// variants are supported: the non-scalarized signature (sig/regs), the
// scalarized compiled-convention signature (sig_cc/regs_cc), and the
// scalarized variant that keeps the receiver non-scalarized
// (sig_cc_ro/regs_cc_ro). When allocate_code_blob is true the generated code
// is wrapped in an AdapterBlob.
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
                                            int comp_args_on_stack,
                                            const GrowableArray<SigEntry>* sig,
                                            const VMRegPair* regs,
                                            const GrowableArray<SigEntry>* sig_cc,
                                            const VMRegPair* regs_cc,
                                            const GrowableArray<SigEntry>* sig_cc_ro,
                                            const VMRegPair* regs_cc_ro,
                                            address entry_address[AdapterBlob::ENTRY_COUNT],
                                            AdapterBlob*& new_adapter,
                                            bool allocate_code_blob) {
  entry_address[AdapterBlob::I2C] = __ pc();
  gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
  Label skip_fixup;

  gen_inline_cache_check(masm, skip_fixup);

  OopMapSet* oop_maps = new OopMapSet();
  int frame_complete = CodeOffsets::frame_never_safe;
  int frame_size_in_words = 0;

  // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
  if (regs_cc != regs_cc_ro) {
    // No class init barrier needed because method is guaranteed to be non-static
    gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
    skip_fixup.reset();
  }

  // Scalarized c2i adapter
  entry_address[AdapterBlob::C2I] = __ pc();
  entry_address[AdapterBlob::C2I_Inline] = __ pc();
  gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                  skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);

  // Non-scalarized c2i adapter
  if (regs != regs_cc) {
    // The scalarized and non-scalarized conventions differ: emit a separate
    // unverified entry and adapter for the non-scalarized signature.
    entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
    Label inline_entry_skip_fixup;
    gen_inline_cache_check(masm, inline_entry_skip_fixup);

    entry_address[AdapterBlob::C2I_Inline] = __ pc();
    gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
  }

  // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
  // the GC knows about the location of oop argument locations passed to the c2i adapter.
  if (allocate_code_blob) {
    bool caller_must_gc_arguments = (regs != regs_cc);
    int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
    assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
    AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
    new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
  }
}
1350
// Computes the native (C) calling convention locations for the given
// signature, filling 'regs' with a VMRegPair per argument. Follows the
// System V AMD64 convention (6 integer + 8 FP argument registers) or, under
// _WIN64, the Windows x64 convention where integer and FP arguments share
// the same 4 positional slots and the caller always reserves shadow space.
// Returns the number of VMRegImpl stack slots needed for outgoing arguments,
// NOT counting out_preserve_stack_slots.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      // 32-bit integral values: one slot (set1).
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        // On Windows, int and FP arguments consume positions in lockstep.
        fp_args++;
        // Allocate slots for callee to stuff register args the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      // 64-bit integral/pointer values: two slots (set2).
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
1464
1465 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1466 uint num_bits,
1467 uint total_args_passed) {
1468 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1469 "only certain vector sizes are supported for now");
1470
1471 static const XMMRegister VEC_ArgReg[32] = {
1472 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1473 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1474 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1475 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1476 };
1477
1478 uint stk_args = 0;
1479 uint fp_args = 0;
1480
1481 for (uint i = 0; i < total_args_passed; i++) {
1482 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1483 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1484 regs[i].set_pair(vmreg->next(next_val), vmreg);
1485 }
1486
1487 return stk_args;
1488 }
1489
1490 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1491 // We always ignore the frame_slots arg and just use the space just below frame pointer
1492 // which by this time is free to use
1493 switch (ret_type) {
1494 case T_FLOAT:
1495 __ movflt(Address(rbp, -wordSize), xmm0);
1496 break;
1497 case T_DOUBLE:
1498 __ movdbl(Address(rbp, -wordSize), xmm0);
1499 break;
1500 case T_VOID: break;
1501 default: {
1502 __ movptr(Address(rbp, -wordSize), rax);
1503 }
1504 }
1505 }
1506
1507 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1508 // We always ignore the frame_slots arg and just use the space just below frame pointer
1509 // which by this time is free to use
1510 switch (ret_type) {
1511 case T_FLOAT:
1512 __ movflt(xmm0, Address(rbp, -wordSize));
1513 break;
1514 case T_DOUBLE:
1515 __ movdbl(xmm0, Address(rbp, -wordSize));
1516 break;
1517 case T_VOID: break;
1518 default: {
1519 __ movptr(rax, Address(rbp, -wordSize));
1520 }
1521 }
1522 }
1523
1524 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1525 for ( int i = first_arg ; i < arg_count ; i++ ) {
1526 if (args[i].first()->is_Register()) {
1527 __ push(args[i].first()->as_Register());
1528 } else if (args[i].first()->is_XMMRegister()) {
1529 __ subptr(rsp, 2*wordSize);
1530 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1531 }
1532 }
1533 }
1534
1535 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1536 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1537 if (args[i].first()->is_Register()) {
1538 __ pop(args[i].first()->as_Register());
1539 } else if (args[i].first()->is_XMMRegister()) {
1540 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1541 __ addptr(rsp, 2*wordSize);
1542 }
1543 }
1544 }
1545
1546 static void verify_oop_args(MacroAssembler* masm,
1547 const methodHandle& method,
1548 const BasicType* sig_bt,
1549 const VMRegPair* regs) {
1550 Register temp_reg = rbx; // not part of any compiled calling seq
1551 if (VerifyOops) {
1552 for (int i = 0; i < method->size_of_parameters(); i++) {
1553 if (is_reference_type(sig_bt[i])) {
1554 VMReg r = regs[i].first();
1555 assert(r->is_valid(), "bad oop arg");
1556 if (r->is_stack()) {
1557 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1558 __ verify_oop(temp_reg);
1559 } else {
1560 __ verify_oop(r->as_Register());
1561 }
1562 }
1563 }
1564 }
1565 }
1566
// Debug-only sanity check that an enterSpecial argument arrived in the fixed
// register the generated code expects: it must not be stack-allocated, and
// it must be in exactly 'expected_reg'. 'name' is used in assert messages.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1575
1576
1577 //---------------------------- continuation_enter_setup ---------------------------
1578 //
1579 // Arguments:
1580 // None.
1581 //
1582 // Results:
1583 // rsp: pointer to blank ContinuationEntry
1584 //
1585 // Kills:
1586 // rax
1587 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  // ContinuationEntry fields are accessed as stack slots in the OopMap, so
  // the layout must be stack-slot aligned.
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Carve out space for a blank ContinuationEntry on the stack.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // Frame covers the entry plus the return address word.
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new entry into the thread's continuation-entry chain:
  // parent <- previous cont_entry; thread->cont_entry <- this entry (rsp).
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1605
1606 //---------------------------- fill_continuation_entry ---------------------------
1607 //
1608 // Arguments:
1609 // rsp: pointer to blank Continuation entry
1610 // reg_cont_obj: pointer to the continuation
1611 // reg_flags: flags
1612 //
1613 // Results:
1614 // rsp: pointer to filled out ContinuationEntry
1615 //
1616 // Kills:
1617 // rax
1618 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug cookie lets frame walkers validate the entry.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Store the continuation oop and flags; clear chunk/argsize/pin_count.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the caller's cont_fastpath in the entry, then reset the thread's
  // cont_fastpath for the new continuation scope.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1635
1636 //---------------------------- continuation_enter_cleanup ---------------------------
1637 //
1638 // Arguments:
1639 // rsp: pointer to the ContinuationEntry
1640 //
1641 // Results:
1642 // rsp: pointer to the spilled rbp in the entry frame
1643 //
1644 // Kills:
1645 // rbx
1646 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // Sanity check: rsp must point at the thread's current ContinuationEntry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Undo fill_continuation_entry()/continuation_enter_setup():
  // restore the parent's cont_fastpath and unlink this entry from the
  // thread's chain. Kills rbx (as advertised in the header comment).
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  // Pop the ContinuationEntry; rsp now points at the spilled rbp of the
  // entry frame.
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1661
// Generates the intrinsic wrapper for Continuation.enterSpecial. Emits two
// entry points (an interpreted i2c-style entry and a compiled entry) that
// both set up a ContinuationEntry frame and either resolve+call
// Continuation.enter (fresh start) or call the thaw stub (continue), plus a
// common exit and an exception-forwarding path. The various offsets are
// returned through the out-parameters for nmethod bookkeeping.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj = 0;
  int pos_is_cont = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj = c_rarg1;
  Register reg_is_cont = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  // Target of the patchable static calls emitted below; the resolver has a
  // special case that binds them to Continuation.enter.
  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax);                      // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    // Resolved by LinkResolver::resolve_continuation_enter() to
    // Continuation.enter (see comment on the compiled path below).
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  // Record the call-to-thaw pc so the runtime can identify/patch it.
  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  // The thaw path needs its own copy of the map, since a map instance may
  // only be registered once.
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path
  //
  // On entry rax holds the exception oop and the return address on the stack
  // is the pc at which the exception happened.

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1834
// Generates the intrinsic wrapper for Continuation.doYield: sets up a minimal
// frame, calls the freeze entry, and either returns into the continuation's
// entry frame (freeze succeeded) or returns to the caller / forwards a
// pending exception (continuation pinned or freeze failed).
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in stack slots (two slots per word on x86_64):
  // saved rbp + return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Call Continuation::freeze_entry()(thread, sp); the walkable last Java
  // frame is required since freeze walks the stack.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // Non-zero result means the continuation could not be frozen (pinned).
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: the frames were copied off the stack, so return into
  // the continuation's entry frame by switching rsp to the ContinuationEntry
  // and tearing it down.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1896
// Public entry point for emitting the ContinuationEntry teardown sequence;
// delegates to the file-local ::continuation_enter_cleanup() helper.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1900
// Generates the compiled entry for a method-handle intrinsic: figures out
// which argument registers hold the receiver and/or the trailing
// MemberName/NativeEntryPoint argument, then hands off to
// MethodHandles::generate_method_handle_dispatch() to emit the actual jump.
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool has_receiver = false;
  Register receiver_reg = noreg;
  int member_arg_pos = -1;
  Register member_reg = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    // linkTo* intrinsics carry the target as a trailing MemberName argument.
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Stack-passed: load it; +wordSize skips the return address.
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note: This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1962
1963 // ---------------------------------------------------------------------------
1964 // Generate a native wrapper for a given method. The method takes arguments
1965 // in the Java compiled code convention, marshals them to the native
1966 // convention (handlizes oops, etc), transitions to native, makes the call,
1967 // returns to java state (possibly blocking), unhandlizes any result and
1968 // returns.
1969 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it is impossible for them
// to be thrown.
1978 //
1979 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1980 const methodHandle& method,
1981 int compile_id,
1982 BasicType* in_sig_bt,
1983 VMRegPair* in_regs,
1984 BasicType ret_type) {
1985 if (method->is_continuation_native_intrinsic()) {
1986 int exception_offset = -1;
1987 OopMapSet* oop_maps = new OopMapSet();
1988 int frame_complete = -1;
1989 int stack_slots = -1;
1990 int interpreted_entry_offset = -1;
1991 int vep_offset = -1;
1992 if (method->is_continuation_enter_intrinsic()) {
1993 gen_continuation_enter(masm,
1994 in_regs,
1995 exception_offset,
1996 oop_maps,
1997 frame_complete,
1998 stack_slots,
1999 interpreted_entry_offset,
2000 vep_offset);
2001 } else if (method->is_continuation_yield_intrinsic()) {
2002 gen_continuation_yield(masm,
2003 in_regs,
2004 oop_maps,
2005 frame_complete,
2006 stack_slots,
2007 vep_offset);
2008 } else {
2009 guarantee(false, "Unknown Continuation native intrinsic");
2010 }
2011
2012 #ifdef ASSERT
2013 if (method->is_continuation_enter_intrinsic()) {
2014 assert(interpreted_entry_offset != -1, "Must be set");
2015 assert(exception_offset != -1, "Must be set");
2016 } else {
2017 assert(interpreted_entry_offset == -1, "Must be unset");
2018 assert(exception_offset == -1, "Must be unset");
2019 }
2020 assert(frame_complete != -1, "Must be set");
2021 assert(stack_slots != -1, "Must be set");
2022 assert(vep_offset != -1, "Must be set");
2023 #endif
2024
2025 __ flush();
2026 nmethod* nm = nmethod::new_native_nmethod(method,
2027 compile_id,
2028 masm->code(),
2029 vep_offset,
2030 frame_complete,
2031 stack_slots,
2032 in_ByteSize(-1),
2033 in_ByteSize(-1),
2034 oop_maps,
2035 exception_offset);
2036 if (nm == nullptr) return nm;
2037 if (method->is_continuation_enter_intrinsic()) {
2038 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2039 } else if (method->is_continuation_yield_intrinsic()) {
2040 _cont_doYield_stub = nm;
2041 }
2042 return nm;
2043 }
2044
2045 if (method->is_method_handle_intrinsic()) {
2046 vmIntrinsics::ID iid = method->intrinsic_id();
2047 intptr_t start = (intptr_t)__ pc();
2048 int vep_offset = ((intptr_t)__ pc()) - start;
2049 gen_special_dispatch(masm,
2050 method,
2051 in_sig_bt,
2052 in_regs);
2053 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
2054 __ flush();
2055 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
2056 return nmethod::new_native_nmethod(method,
2057 compile_id,
2058 masm->code(),
2059 vep_offset,
2060 frame_complete,
2061 stack_slots / VMRegImpl::slots_per_word,
2062 in_ByteSize(-1),
2063 in_ByteSize(-1),
2064 nullptr);
2065 }
2066 address native_func = method->native_function();
2067 assert(native_func != nullptr, "must have function");
2068
2069 // An OopMap for lock (and class if static)
2070 OopMapSet *oop_maps = new OopMapSet();
2071 intptr_t start = (intptr_t)__ pc();
2072
2073 // We have received a description of where all the java arg are located
2074 // on entry to the wrapper. We need to convert these args to where
2075 // the jni function will expect them. To figure out where they go
2076 // we convert the java signature to a C signature by inserting
2077 // the hidden arguments as arg[0] and possibly arg[1] (static method)
2078
2079 const int total_in_args = method->size_of_parameters();
2080 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2081
2082 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2083 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2084
2085 int argc = 0;
2086 out_sig_bt[argc++] = T_ADDRESS;
2087 if (method->is_static()) {
2088 out_sig_bt[argc++] = T_OBJECT;
2089 }
2090
2091 for (int i = 0; i < total_in_args ; i++ ) {
2092 out_sig_bt[argc++] = in_sig_bt[i];
2093 }
2094
2095 // Now figure out where the args must be stored and how much stack space
2096 // they require.
2097 int out_arg_slots;
2098 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2099
2100 // Compute framesize for the wrapper. We need to handlize all oops in
2101 // incoming registers
2102
2103 // Calculate the total number of stack slots we will need.
2104
2105 // First count the abi requirement plus all of the outgoing args
2106 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2107
2108 // Now the space for the inbound oop handle area
2109 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
2110
2111 int oop_handle_offset = stack_slots;
2112 stack_slots += total_save_slots;
2113
2114 // Now any space we need for handlizing a klass if static method
2115
2116 int klass_slot_offset = 0;
2117 int klass_offset = -1;
2118 int lock_slot_offset = 0;
2119 bool is_static = false;
2120
2121 if (method->is_static()) {
2122 klass_slot_offset = stack_slots;
2123 stack_slots += VMRegImpl::slots_per_word;
2124 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2125 is_static = true;
2126 }
2127
2128 // Plus a lock if needed
2129
2130 if (method->is_synchronized()) {
2131 lock_slot_offset = stack_slots;
2132 stack_slots += VMRegImpl::slots_per_word;
2133 }
2134
2135 // Now a place (+2) to save return values or temp during shuffling
2136 // + 4 for return address (which we own) and saved rbp
2137 stack_slots += 6;
2138
2139 // Ok The space we have allocated will look like:
2140 //
2141 //
2142 // FP-> | |
2143 // |---------------------|
2144 // | 2 slots for moves |
2145 // |---------------------|
2146 // | lock box (if sync) |
2147 // |---------------------| <- lock_slot_offset
2148 // | klass (if static) |
2149 // |---------------------| <- klass_slot_offset
2150 // | oopHandle area |
2151 // |---------------------| <- oop_handle_offset (6 java arg registers)
2152 // | outbound memory |
2153 // | based arguments |
2154 // | |
2155 // |---------------------|
2156 // | |
2157 // SP-> | out_preserved_slots |
2158 //
2159 //
2160
2161
2162 // Now compute actual number of stack words we need rounding to make
2163 // stack properly aligned.
2164 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2165
2166 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2167
2168 // First thing make an ic check to see if we should even be here
2169
2170 // We are free to use all registers as temps without saving them and
2171 // restoring them except rbp. rbp is the only callee save register
2172 // as far as the interpreter and the compiler(s) are concerned.
2173
2174 const Register receiver = j_rarg0;
2175
2176 Label exception_pending;
2177
2178 assert_different_registers(receiver, rscratch1, rscratch2);
2179 __ verify_oop(receiver);
2180 __ ic_check(8 /* end_alignment */);
2181
2182 int vep_offset = ((intptr_t)__ pc()) - start;
2183
2184 if (method->needs_clinit_barrier()) {
2185 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2186 Label L_skip_barrier;
2187 Register klass = r10;
2188 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2189 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2190
2191 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2192
2193 __ bind(L_skip_barrier);
2194 }
2195
2196 #ifdef COMPILER1
2197 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2198 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2199 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2200 }
2201 #endif // COMPILER1
2202
2203 // The instruction at the verified entry point must be 5 bytes or longer
2204 // because it can be patched on the fly by make_non_entrant. The stack bang
2205 // instruction fits that requirement.
2206
2207 // Generate stack overflow check
2208 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2209
2210 // Generate a new frame for the wrapper.
2211 __ enter();
2212 // -2 because return address is already present and so is saved rbp
2213 __ subptr(rsp, stack_size - 2*wordSize);
2214
2215 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2216 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2217 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2218
2219 // Frame is now completed as far as size and linkage.
2220 int frame_complete = ((intptr_t)__ pc()) - start;
2221
2222 #ifdef ASSERT
2223 __ check_stack_alignment(rsp, "improperly aligned stack");
2224 #endif /* ASSERT */
2225
2226
2227 // We use r14 as the oop handle for the receiver/klass
2228 // It is callee save so it survives the call to native
2229
2230 const Register oop_handle_reg = r14;
2231
2232 //
2233 // We immediately shuffle the arguments so that any vm call we have to
2234 // make from here on out (sync slow path, jvmti, etc.) we will have
2235 // captured the oops from our caller and have a valid oopMap for
2236 // them.
2237
2238 // -----------------
2239 // The Grand Shuffle
2240
2241 // The Java calling convention is either equal (linux) or denser (win64) than the
2242 // c calling convention. However the because of the jni_env argument the c calling
2243 // convention always has at least one more (and two for static) arguments than Java.
2244 // Therefore if we move the args from java -> c backwards then we will never have
2245 // a register->register conflict and we don't have to build a dependency graph
2246 // and figure out how to break any cycles.
2247 //
2248
2249 // Record esp-based slot for receiver on stack for non-static methods
2250 int receiver_offset = -1;
2251
2252 // This is a trick. We double the stack slots so we can claim
2253 // the oops in the caller's frame. Since we are sure to have
2254 // more args than the caller doubling is enough to make
2255 // sure we can capture all the incoming oop args from the
2256 // caller.
2257 //
2258 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2259
2260 // Mark location of rbp (someday)
2261 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2262
2263 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2264 // All inbound args are referenced based on rbp and all outbound args via rsp.
2265
2266
2267 #ifdef ASSERT
2268 bool reg_destroyed[Register::number_of_registers];
2269 bool freg_destroyed[XMMRegister::number_of_registers];
2270 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2271 reg_destroyed[r] = false;
2272 }
2273 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2274 freg_destroyed[f] = false;
2275 }
2276
2277 #endif /* ASSERT */
2278
2279 // For JNI natives the incoming and outgoing registers are offset upwards.
2280 GrowableArray<int> arg_order(2 * total_in_args);
2281
2282 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2283 arg_order.push(i);
2284 arg_order.push(c_arg);
2285 }
2286
2287 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2288 int i = arg_order.at(ai);
2289 int c_arg = arg_order.at(ai + 1);
2290 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2291 #ifdef ASSERT
2292 if (in_regs[i].first()->is_Register()) {
2293 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2294 } else if (in_regs[i].first()->is_XMMRegister()) {
2295 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2296 }
2297 if (out_regs[c_arg].first()->is_Register()) {
2298 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2299 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2300 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2301 }
2302 #endif /* ASSERT */
2303 switch (in_sig_bt[i]) {
2304 case T_ARRAY:
2305 case T_OBJECT:
2306 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2307 ((i == 0) && (!is_static)),
2308 &receiver_offset);
2309 break;
2310 case T_VOID:
2311 break;
2312
2313 case T_FLOAT:
2314 __ float_move(in_regs[i], out_regs[c_arg]);
2315 break;
2316
2317 case T_DOUBLE:
2318 assert( i + 1 < total_in_args &&
2319 in_sig_bt[i + 1] == T_VOID &&
2320 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2321 __ double_move(in_regs[i], out_regs[c_arg]);
2322 break;
2323
2324 case T_LONG :
2325 __ long_move(in_regs[i], out_regs[c_arg]);
2326 break;
2327
2328 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2329
2330 default:
2331 __ move32_64(in_regs[i], out_regs[c_arg]);
2332 }
2333 }
2334
2335 int c_arg;
2336
2337 // Pre-load a static method's oop into r14. Used both by locking code and
2338 // the normal JNI call code.
2339 // point c_arg at the first arg that is already loaded in case we
2340 // need to spill before we call out
2341 c_arg = total_c_args - total_in_args;
2342
2343 if (method->is_static()) {
2344
2345 // load oop into a register
2346 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2347
2348 // Now handlize the static class mirror it's known not-null.
2349 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2350 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2351
2352 // Now get the handle
2353 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2354 // store the klass handle as second argument
2355 __ movptr(c_rarg1, oop_handle_reg);
2356 // and protect the arg if we must spill
2357 c_arg--;
2358 }
2359
2360 // Change state to native (we save the return address in the thread, since it might not
2361 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2362 // points into the right code segment. It does not have to be the correct return pc.
2363 // We use the same pc/oopMap repeatedly when we call out
2364
2365 Label native_return;
2366 if (method->is_object_wait0()) {
2367 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2368 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2369 } else {
2370 intptr_t the_pc = (intptr_t) __ pc();
2371 oop_maps->add_gc_map(the_pc - start, map);
2372
2373 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2374 }
2375
2376 // We have all of the arguments setup at this point. We must not touch any register
2377 // argument registers at this point (what if we save/restore them there are no oop?
2378
2379 if (DTraceMethodProbes) {
2380 // protect the args we've loaded
2381 save_args(masm, total_c_args, c_arg, out_regs);
2382 __ mov_metadata(c_rarg1, method());
2383 __ call_VM_leaf(
2384 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2385 r15_thread, c_rarg1);
2386 restore_args(masm, total_c_args, c_arg, out_regs);
2387 }
2388
2389 // RedefineClasses() tracing support for obsolete method entry
2390 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2391 // protect the args we've loaded
2392 save_args(masm, total_c_args, c_arg, out_regs);
2393 __ mov_metadata(c_rarg1, method());
2394 __ call_VM_leaf(
2395 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2396 r15_thread, c_rarg1);
2397 restore_args(masm, total_c_args, c_arg, out_regs);
2398 }
2399
2400 // Lock a synchronized method
2401
2402 // Register definitions used by locking and unlocking
2403
2404 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2405 const Register obj_reg = rbx; // Will contain the oop
2406 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2407
2408 Label slow_path_lock;
2409 Label lock_done;
2410
2411 if (method->is_synchronized()) {
2412 // Get the handle (the 2nd argument)
2413 __ mov(oop_handle_reg, c_rarg1);
2414
2415 // Get address of the box
2416
2417 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2418
2419 // Load the oop from the handle
2420 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2421
2422 __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2423
2424 // Slow path will re-enter here
2425 __ bind(lock_done);
2426 }
2427
2428 // Finally just about ready to make the JNI call
2429
2430 // get JNIEnv* which is first argument to native
2431 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2432
2433 // Now set thread in native
2434 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2435
2436 __ call(RuntimeAddress(native_func));
2437
2438 // Verify or restore cpu control state after JNI call
2439 __ restore_cpu_control_state_after_jni(rscratch1);
2440
2441 // Unpack native results.
2442 switch (ret_type) {
2443 case T_BOOLEAN: __ c2bool(rax); break;
2444 case T_CHAR : __ movzwl(rax, rax); break;
2445 case T_BYTE : __ sign_extend_byte (rax); break;
2446 case T_SHORT : __ sign_extend_short(rax); break;
2447 case T_INT : /* nothing to do */ break;
2448 case T_DOUBLE :
2449 case T_FLOAT :
2450 // Result is in xmm0 we'll save as needed
2451 break;
2452 case T_ARRAY: // Really a handle
2453 case T_OBJECT: // Really a handle
2454 break; // can't de-handlize until after safepoint check
2455 case T_VOID: break;
2456 case T_LONG: break;
2457 default : ShouldNotReachHere();
2458 }
2459
2460 // Switch thread to "native transition" state before reading the synchronization state.
2461 // This additional state is necessary because reading and testing the synchronization
2462 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2463 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2464 // VM thread changes sync state to synchronizing and suspends threads for GC.
2465 // Thread A is resumed to finish this native method, but doesn't block here since it
2466 // didn't see any synchronization is progress, and escapes.
2467 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2468
2469 // Force this write out before the read below
2470 if (!UseSystemMemoryBarrier) {
2471 __ membar(Assembler::Membar_mask_bits(
2472 Assembler::LoadLoad | Assembler::LoadStore |
2473 Assembler::StoreLoad | Assembler::StoreStore));
2474 }
2475
2476 // check for safepoint operation in progress and/or pending suspend requests
2477 {
2478 Label Continue;
2479 Label slow_path;
2480
2481 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2482
2483 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2484 __ jcc(Assembler::equal, Continue);
2485 __ bind(slow_path);
2486
2487 // Don't use call_VM as it will see a possible pending exception and forward it
2488 // and never return here preventing us from clearing _last_native_pc down below.
2489 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2490 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2491 // by hand.
2492 //
2493 __ vzeroupper();
2494 save_native_result(masm, ret_type, stack_slots);
2495 __ mov(c_rarg0, r15_thread);
2496 __ mov(r12, rsp); // remember sp
2497 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2498 __ andptr(rsp, -16); // align stack as required by ABI
2499 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2500 __ mov(rsp, r12); // restore sp
2501 __ reinit_heapbase();
2502 // Restore any method result value
2503 restore_native_result(masm, ret_type, stack_slots);
2504 __ bind(Continue);
2505 }
2506
2507 // change thread state
2508 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2509
2510 if (method->is_object_wait0()) {
2511 // Check preemption for Object.wait()
2512 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2513 __ cmpptr(rscratch1, NULL_WORD);
2514 __ jccb(Assembler::equal, native_return);
2515 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2516 __ jmp(rscratch1);
2517 __ bind(native_return);
2518
2519 intptr_t the_pc = (intptr_t) __ pc();
2520 oop_maps->add_gc_map(the_pc - start, map);
2521 }
2522
2523
2524 Label reguard;
2525 Label reguard_done;
2526 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2527 __ jcc(Assembler::equal, reguard);
2528 __ bind(reguard_done);
2529
2530 // native result if any is live
2531
2532 // Unlock
2533 Label slow_path_unlock;
2534 Label unlock_done;
2535 if (method->is_synchronized()) {
2536
2537 Label fast_done;
2538
2539 // Get locked oop from the handle we passed to jni
2540 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2541
2542 // Must save rax if it is live now because cmpxchg must use it
2543 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2544 save_native_result(masm, ret_type, stack_slots);
2545 }
2546
2547 __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2548
2549 // slow path re-enters here
2550 __ bind(unlock_done);
2551 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2552 restore_native_result(masm, ret_type, stack_slots);
2553 }
2554
2555 __ bind(fast_done);
2556 }
2557 if (DTraceMethodProbes) {
2558 save_native_result(masm, ret_type, stack_slots);
2559 __ mov_metadata(c_rarg1, method());
2560 __ call_VM_leaf(
2561 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2562 r15_thread, c_rarg1);
2563 restore_native_result(masm, ret_type, stack_slots);
2564 }
2565
2566 __ reset_last_Java_frame(false);
2567
2568 // Unbox oop result, e.g. JNIHandles::resolve value.
2569 if (is_reference_type(ret_type)) {
2570 __ resolve_jobject(rax /* value */,
2571 rcx /* tmp */);
2572 }
2573
2574 if (CheckJNICalls) {
2575 // clear_pending_jni_exception_check
2576 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2577 }
2578
2579 // reset handle block
2580 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2581 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2582
2583 // pop our frame
2584
2585 __ leave();
2586
2587 #if INCLUDE_JFR
2588 // We need to do a poll test after unwind in case the sampler
2589 // managed to sample the native frame after returning to Java.
2590 Label L_return;
2591 address poll_test_pc = __ pc();
2592 __ relocate(relocInfo::poll_return_type);
2593 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2594 __ jccb(Assembler::zero, L_return);
2595 __ lea(rscratch1, InternalAddress(poll_test_pc));
2596 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2597 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2598 "polling page return stub not created yet");
2599 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2600 __ jump(RuntimeAddress(stub));
2601 __ bind(L_return);
2602 #endif // INCLUDE_JFR
2603
2604 // Any exception pending?
2605 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2606 __ jcc(Assembler::notEqual, exception_pending);
2607
2608 // Return
2609
2610 __ ret(0);
2611
2612 // Unexpected paths are out of line and go here
2613
2614 // forward the exception
2615 __ bind(exception_pending);
2616
2617 // and forward the exception
2618 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2619
2620 // Slow path locking & unlocking
2621 if (method->is_synchronized()) {
2622
2623 // BEGIN Slow path lock
2624 __ bind(slow_path_lock);
2625
2626 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2627 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2628
2629 // protect the args we've loaded
2630 save_args(masm, total_c_args, c_arg, out_regs);
2631
2632 __ mov(c_rarg0, obj_reg);
2633 __ mov(c_rarg1, lock_reg);
2634 __ mov(c_rarg2, r15_thread);
2635
2636 // Not a leaf but we have last_Java_frame setup as we want.
2637 // We don't want to unmount in case of contention since that would complicate preserving
2638 // the arguments that had already been marshalled into the native convention. So we force
2639 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2640 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2641 __ push_cont_fastpath();
2642 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2643 __ pop_cont_fastpath();
2644 restore_args(masm, total_c_args, c_arg, out_regs);
2645
2646 #ifdef ASSERT
2647 { Label L;
2648 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2649 __ jcc(Assembler::equal, L);
2650 __ stop("no pending exception allowed on exit from monitorenter");
2651 __ bind(L);
2652 }
2653 #endif
2654 __ jmp(lock_done);
2655
2656 // END Slow path lock
2657
2658 // BEGIN Slow path unlock
2659 __ bind(slow_path_unlock);
2660
2661 // If we haven't already saved the native result we must save it now as xmm registers
2662 // are still exposed.
2663 __ vzeroupper();
2664 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2665 save_native_result(masm, ret_type, stack_slots);
2666 }
2667
2668 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2669
2670 __ mov(c_rarg0, obj_reg);
2671 __ mov(c_rarg2, r15_thread);
2672 __ mov(r12, rsp); // remember sp
2673 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2674 __ andptr(rsp, -16); // align stack as required by ABI
2675
2676 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2677 // NOTE that obj_reg == rbx currently
2678 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2679 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2680
2681 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2682 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2683 __ mov(rsp, r12); // restore sp
2684 __ reinit_heapbase();
2685 #ifdef ASSERT
2686 {
2687 Label L;
2688 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2689 __ jcc(Assembler::equal, L);
2690 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2691 __ bind(L);
2692 }
2693 #endif /* ASSERT */
2694
2695 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2696
2697 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2698 restore_native_result(masm, ret_type, stack_slots);
2699 }
2700 __ jmp(unlock_done);
2701
2702 // END Slow path unlock
2703
2704 } // synchronized
2705
2706 // SLOW PATH Reguard the stack if needed
2707
2708 __ bind(reguard);
2709 __ vzeroupper();
2710 save_native_result(masm, ret_type, stack_slots);
2711 __ mov(r12, rsp); // remember sp
2712 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2713 __ andptr(rsp, -16); // align stack as required by ABI
2714 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2715 __ mov(rsp, r12); // restore sp
2716 __ reinit_heapbase();
2717 restore_native_result(masm, ret_type, stack_slots);
2718 // and continue
2719 __ jmp(reguard_done);
2720
2721
2722
2723 __ flush();
2724
2725 nmethod *nm = nmethod::new_native_nmethod(method,
2726 compile_id,
2727 masm->code(),
2728 vep_offset,
2729 frame_complete,
2730 stack_slots / VMRegImpl::slots_per_word,
2731 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2732 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2733 oop_maps);
2734
2735 return nm;
2736 }
2737
2738 // this function returns the adjust size (in number of words) to a c2i adapter
2739 // activation for use during deoptimization
2740 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2741 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2742 }
2743
2744
// Number of stack slots a caller must preserve below its outgoing
// arguments: none on amd64.
uint SharedRuntime::out_preserve_stack_slots() {
  return 0;
}
2748
2749
2750 // Number of stack slots between incoming argument block and the start of
2751 // a new frame. The PROLOG must add this many slots to the stack. The
2752 // EPILOG must remove this many slots. amd64 needs two slots for
2753 // return address.
2754 uint SharedRuntime::in_preserve_stack_slots() {
2755 return 4 + 2 * VerifyStackAtCalls;
2756 }
2757
// The register holding the current JavaThread*: r15 on x86_64.
VMReg SharedRuntime::thread_register() {
  return r15_thread->as_VMReg();
}
2761
//------------------------------generate_deopt_blob----------------------------
// Generates the shared DeoptimizationBlob: the code a deoptimizing compiled
// frame returns into. It saves all registers, calls
// Deoptimization::fetch_unroll_info() to learn the replacement frame layout,
// pops the deoptimized frame, pushes skeletal interpreter frames, and calls
// Deoptimization::unpack_frames() to populate them before resuming in the
// interpreter. Has separate entry points for normal deopt, re-execute, and
// exception-pending deopt (plus JVMCI uncommon-trap entries when enabled).
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  int pad = 0;
  if (UseAVX > 2) {
    pad += 1024;
  }
  if (UseAPX) {
    pad += 1024;
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
  // Reuse a previously generated blob from the AOT code cache if present.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
  if (blob != nullptr) {
    _deopt_blob = blob->as_deoptimization_blob();
    return;
  }

  CodeBuffer buffer(name, 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod. A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  // rax: exception oop
  // rbx: exception handler
  // rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt. We save the argument return
  // registers. We call the first C routine, fetch_unroll_info(). This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state. We pop the current frame, then
  // push all the new frames. Then we call the C routine unpack_frames() to
  // populate these frames. Finally unpack_frames() returns us the new target
  // address. Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for non exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // return address is the pc that describes what bci to re-execute at

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    // Push the implicit-exception pc as the return address, then clear it.
    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    // Pass the pending deoptimization reason and reset the field to "none".
    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // all registers are dead at this entry point, except for rax, and
  // rdx which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // load throwing pc from JavaThread and patch it as the return address
  // of the current frame. Then clear the field in JavaThread

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  __ bind(cont);

  // Call C code. Need thread and this frame, but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers. Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0)); // Save return address
  __ enter(); // Save old & set new ebp
  __ subptr(rsp, rbx); // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp); // Pass sender_sp to next frame
  __ addptr(rsi, wordSize); // Bump array pointer (sizes)
  __ addptr(rcx, wordSize); // Bump array pointer (pcs)
  __ decrementl(rdx); // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0)); // Save final return address

  // Re-push self-frame
  __ enter(); // Save old & set new ebp

  // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two less words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave(); // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif

  // Publish the freshly generated blob to the AOT code cache for reuse.
  AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
}
3140
//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
// id selects the polling-page stub variant; call_ptr is the VM routine
// invoked at the safepoint. For non-return polls the stub also steps the
// stashed return pc past the poll instruction before resuming.
//
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code. Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Reuse a previously generated blob from the AOT code cache if present.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: poll was at a return site, so a valid return address is
  // already on the stack; otherwise we must materialize one ourselves.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is b/w [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     thereby adding additional byte to instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4); // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Publish the freshly generated blob to the AOT code cache for reuse.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3325
3326 //
3327 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3328 //
3329 // Generate a stub that calls into vm to find out the proper destination
3330 // of a java call. All the argument registers are live at this point
3331 // but since this is generic code we don't know what they are and the caller
3332 // must do any gc of the args.
3333 //
RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
  assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
  assert(is_resolve_id(id), "expected a resolve stub id");

  const char* name = SharedRuntime::stub_name(id);
  // Reuse an AOT-cached copy of this blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  // allocate space for the code
  ResourceMark rm;
  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // All argument registers of the Java call being resolved are live here;
  // save them so the resolver's safepoint can see/update any oops in them.
  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  // Make the frame walkable before calling into the VM.
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The resolver entry point takes the current thread as its only argument.
  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_metadata(rbx);
  // Patch the saved copies of rbx/rax in the register-save area so that
  // restore_live_registers below materializes the resolved Method* in rbx
  // and the target entry address in rax.
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  // Clear any stale oop result so it is not treated as live by GC.
  __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
  RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);

  // Persist the freshly generated blob so future runs can load it from the cache.
  AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return rs_blob;
}
3419
3420 // Continuation point for throwing of implicit exceptions that are
3421 // not handled in the current activation. Fabricates an exception
3422 // oop and initiates normal exception dispatching in this
3423 // frame. Since we need to preserve callee-saved values (currently
3424 // only for C2, but done for C1 as well) we need a callee-saved oop
3425 // map and therefore have to make these stubs into RuntimeStubs
3426 // rather than BufferBlobs. If the compiler needs all registers to
3427 // be preserved between the fault point and the exception handler
3428 // then it must assume responsibility for that in
3429 // AbstractCompiler::continuation_for_implicit_null_exception or
3430 // continuation_for_implicit_division_by_zero_exception. All other
3431 // implicit exceptions (e.g., NullPointerException or
3432 // AbstractMethodError on entry) are either at call sites or
3433 // otherwise assume that stack unwinding will be initiated, so
3434 // caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // Offsets are in 32-bit (VMRegImpl) slots; each 64-bit entity uses two.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Reuse an AOT-cached copy of this blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack

  // Call runtime; the entry point takes the current thread as its only argument.
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate (empty) oop map for the call site — nothing is recorded in it.
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  Label L;
  // The runtime entry is expected to have installed the exception being
  // thrown; verify that before forwarding it.
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  // Persist the freshly generated blob so future runs can load it from the cache.
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3525
3526 //------------------------------Montgomery multiplication------------------------
3527 //
3528
3529 #ifndef _WINDOWS
3530
// Subtract 0:b from carry:a. Return carry.
// Walks the len 64-bit words with a rippling sbb ("clc" clears the borrow
// before the first iteration); inc/dec do not modify CF, so the borrow
// propagates across iterations.  The final "mov/sbb $0" computes
// carry minus the overall borrow.
// NOTE: the loop body runs before the jne test, so len must be >= 1.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3548
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// "mul %5" takes B implicitly in rax and leaves the 128-bit product in
// rdx:rax; the add/adc chain then ripples the product and its carries
// into the T0/T1/T2 triple.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator (the add/adc chain is simply executed twice).
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)
3569
3570 #else //_WINDOWS
3571
// Subtract 0:b from carry:a. Return carry.
// Portable (MSVC intrinsic) counterpart of the GCC inline-asm sub():
// a - b is computed as a + ~b + 1, with the initial carry c = 1 supplying
// the "+ 1".  The trailing _addcarry_u64 evaluates carry + ~0 + c, i.e.
// carry - 1 + c, which is carry minus the overall borrow.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}
3584
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// _umul128 produces the full 128-bit product (lo, hi); the _addcarry_u64
// chain ripples it, with carries, into the T0/T1/T2 triple.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)

// As above, but add twice the double-length result into the
// accumulator (the accumulate chain is simply executed twice).
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)
3609
3610 #endif //_WINDOWS
3611
3612 // Fast Montgomery multiplication. The derivation of the algorithm is
3613 // in A Cryptographic Library for the Motorola DSP56000,
3614 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3615
static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  // inv must be -n^-1 mod 2^64, so inv*n[0] == 2^64-1.
  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // First pass: produce the low len words.  On iteration i we accumulate
  // every partial product of word-weight i, pick m[i] so that the
  // accumulator's low word cancels to zero (the asserted invariant),
  // then shift the accumulator down one word.
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: the high len words form the result; they are written
  // into the low half of m, reusing it as the output buffer.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final reduction: while an overflow word remains, subtract n.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3652
3653 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3654 // multiplies so it should be up to 25% faster than Montgomery
3655 // multiplication. However, its loop control is more complex and it
3656 // may actually run slower on some machines.
3657
static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  // inv must be -n^-1 mod 2^64, so inv*n[0] == 2^64-1.
  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // First pass: low len words.  When squaring, each off-diagonal product
  // a[j]*a[i-j] occurs twice, so MACC2 adds it doubled; the diagonal term
  // a[j]*a[j] (present only for even i) is added once.  As in
  // montgomery_multiply, m[i] is chosen so the low accumulator word
  // cancels to zero each iteration.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: the high len words form the result, written into the
  // low half of m (reused as the output buffer).
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final reduction: while an overflow word remains, subtract n.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3708
3709 // Swap words in a longword.
3710 static julong swap(julong x) {
3711 return (x << 32) | (x >> 32);
3712 }
3713
3714 // Copy len longwords from s to d, word-swapping as we go. The
3715 // destination array is reversed.
3716 static void reverse_words(julong *s, julong *d, int len) {
3717 d += len;
3718 while(len-- > 0) {
3719 d--;
3720 *d = swap(*s);
3721 s++;
3722 }
3723 }
3724
3725 // The threshold at which squaring is advantageous was determined
3726 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3727 #define MONTGOMERY_SQUARING_THRESHOLD 64
3728
// Entry point from the JIT intrinsic: Montgomery-multiply the jint
// magnitudes a*b mod n (len jints each) into m_ints, using inv == -n^-1
// mod 2^64.  Converts to/from the julong layout the core loop expects.
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to an 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  // Stack allocation is safe here thanks to the guarantee above.
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  // reverse_words both reverses the longword order and swaps the two
  // 32-bit halves within each longword, converting between the caller's
  // jint-array layout and the julong layout used by the core loops.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}
3758
// Entry point from the JIT intrinsic: Montgomery-square the jint
// magnitude a mod n into m_ints.  Uses the dedicated squaring routine
// only above MONTGOMERY_SQUARING_THRESHOLD, where its reduced multiply
// count wins over its more complex loop control.
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to an 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  // Stack allocation is safe here thanks to the guarantee above.
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays (only three needed: no second input).
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  // Convert jint-array layout to the julong layout used by the core loops.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
3790
// Generate the pack/unpack adapter for a buffered inline type (Valhalla):
// "pack" stores the field values held in registers (per vk->return_regs())
// into the heap buffer whose oop is in rax; "unpack" loads them back out.
// Returns nullptr if the code buffer cannot be allocated.
BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
  BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
  if (buf == nullptr) {
    return nullptr;
  }
  CodeBuffer buffer(buf);
  short buffer_locs[20];
  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
                                         sizeof(buffer_locs)/sizeof(relocInfo));

  MacroAssembler* masm = new MacroAssembler(&buffer);

  const Array<SigEntry>* sig_vk = vk->extended_sig();
  const Array<VMRegPair>* regs = vk->return_regs();

  // Entry 1: resolve the JNI handle in *r13 to a raw oop before packing.
  int pack_fields_jobject_off = __ offset();
  // Resolve pre-allocated buffer from JNI handle.
  // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
  __ movptr(rax, Address(r13, 0));
  __ resolve_jobject(rax /* value */,
                     r12 /* tmp */);
  __ movptr(Address(r13, 0), rax);

  // Entry 2: pack field registers into the buffer whose oop is in rax.
  int pack_fields_off = __ offset();

  // j indexes regs; it starts at 1, skipping regs->at(0) (presumably the
  // oop of the buffered value itself — confirm against return_regs()).
  int j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    // A T_VOID entry is the second half of a preceding long/double and
    // consumes a register pair slot only in that case.
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    Address to(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(to, r_1->as_XMMRegister());
    } else if (bt == T_DOUBLE) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      Register val = r_1->as_Register();
      assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
      if (is_reference_type(bt)) {
        // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
        __ mov(rbx, rax);
        Address to_with_rbx(rbx, off);
        __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
      }
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");
  if (vk->supports_nullable_layouts()) {
    // Set the null marker
    __ movb(Address(rax, vk->null_marker_offset()), 1);
  }
  __ ret(0);

  // Entry 3: unpack the buffer in rax back into the field registers.
  int unpack_fields_off = __ offset();

  Label skip;
  Label not_null;
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, not_null);

  // Return value is null. Zero all registers because the runtime requires a canonical
  // representation of a flat null.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }

    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    if (r_1->is_XMMRegister()) {
      __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
    } else {
      __ xorl(r_1->as_Register(), r_1->as_Register());
    }
    j++;
  }
  __ jmp(skip);
  __ bind(not_null);

  // Non-null: load each field from the buffer into its register.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second(); // NOTE(review): r_2 is currently unused here
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      // rax must stay intact while more fields remain to be loaded.
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      size_t size_in_bytes = type2aelembytes(bt);
      // Sign-extend everything except the unsigned types (char, boolean).
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}
3937
3938 #if INCLUDE_JFR
3939
3940 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3941 // It returns a jobject handle to the event writer.
3942 // The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Frame layout in 32-bit (VMRegImpl) slots: saved rbp plus return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Make the frame walkable, then call into the VM with the current thread.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  // Register an oop map at the call site so the frame can be walked at a safepoint.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3986
3987 // For c2: call to return a leased buffer.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Frame layout in 32-bit (VMRegImpl) slots: saved rbp plus return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Make the frame walkable, then call into the VM with the current thread.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  // Register an oop map at the call site so the frame can be walked at a safepoint.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4028
4029 #endif // INCLUDE_JFR