1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_SHENANDOAHGC
69 #include "gc/shenandoah/shenandoahRuntime.hpp"
70 #endif
71 #if INCLUDE_JVMCI
72 #include "jvmci/jvmciJavaClasses.hpp"
73 #endif
74
75 #define __ masm->
76
77 #ifdef PRODUCT
78 #define BLOCK_COMMENT(str) /* nothing */
79 #else
80 #define BLOCK_COMMENT(str) __ block_comment(str)
81 #endif // PRODUCT
82
83 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
84
// Helper that knows how to spill every live register to the stack (building
// an OopMap describing the result) and how to restore them again. Used by
// safepoint/deoptimization blobs that must preserve the full machine state.
class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
  // Byte offsets of the state components inside the XSAVE image that
  // push_FPU_state() lays out on the stack; these mirror the
  // hardware-defined XSAVE area layout.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Each DEF_*_OFFS(n) helper defines the slot-offset pair
  // <reg>n_off / <reg>nH_off (low/high halves) for one register.
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // APX extended GPRs r16..r31 (saved only when UseAPX is on)
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy GPRs, in the order save_legacy_gprs()/restore_live_registers()
    // push/pop them above the FPU state.
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

  // Clears the wide_vectors request when the platform/compilers cannot
  // produce vectors wider than 16 bytes.
  static void adjust_wide_vectors_support(bool& wide_vectors);

 public:
  // Spill all live registers and return an OopMap describing the frame.
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  // Reload all registers saved by save_live_registers and pop the frame.
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
186
187 // TODO: Should be upstreamed separately.
188 void RegisterSaver::adjust_wide_vectors_support(bool& wide_vectors) {
189 #if COMPILER2_OR_JVMCI
190 if (wide_vectors && UseAVX == 0) {
191 wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
192 }
193 assert(!wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
194 #else
195 wide_vectors = false; // vectors are generated only by C2 and JVMCI
196 #endif
197 }
198
// Spills flags, all GPRs (including APX r16..r31 when enabled), the FPU/XSAVE
// state and, when save_wide_vectors is set, the upper YMM/ZMM vector state
// onto the stack. Returns an OopMap that maps every saved register to its
// stack slot so GC and deoptimization can find oops and debug values.
// On return *total_frame_words holds the resulting frame size in words.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();

  // Clear save_wide_vectors if wide vectors are not possible in this config.
  adjust_wide_vectors_support(save_wide_vectors);

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Save the APX extended GPRs r16..r31, which save_legacy_gprs() above
  // does not cover.
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  // APX extended GPRs, saved above only when UseAPX is on.
  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
437
// Reloads every register that save_live_registers() spilled (vector state
// first, then FPU/XSAVE state, legacy GPRs, flags and finally rbp) and pops
// the register-save frame. Must exactly mirror the save sequence above.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Must agree with the decision made in save_live_registers().
  adjust_wide_vectors_support(restore_wide_vectors);

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Restore the APX extended GPRs r16..r31, not covered by restore_legacy_gprs().
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);   // undo the alignment adjustment made at save time
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
514
// Reloads only the Java result registers (xmm0 for FP, rax/rdx for integer
// results) from the register save area, then pops the whole area leaving
// just the return address on the stack.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
532
533 // Is vector's size (in bytes) bigger than a size saved by default?
534 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
535 bool SharedRuntime::is_wide_vector(int size) {
536 return size > 16;
537 }
538
539 // ---------------------------------------------------------------------------
540 // Read the array of BasicTypes from a signature, and compute where the
541 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
542 // quantities. Values less than VMRegImpl::stack0 are registers, those above
543 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
544 // as framesizes are fixed.
545 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
547 // Register up to Register::number_of_registers are the 64-bit
548 // integer registers.
549
550 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
551 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
552 // units regardless of build. Of course for i486 there is no 64 bit build
553
554 // The Java calling convention is a "shifted" version of the C ABI.
555 // By skipping the first C ABI register we can call non-static jni methods
556 // with small numbers of arguments without having to shuffle the arguments
557 // at all. Since we control the java ABI we ought to at least get some
558 // advantage out of it.
559
560 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
561 VMRegPair *regs,
562 int total_args_passed) {
563
564 // Create the mapping between argument positions and
565 // registers.
566 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
567 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
568 };
569 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
570 j_farg0, j_farg1, j_farg2, j_farg3,
571 j_farg4, j_farg5, j_farg6, j_farg7
572 };
573
574
575 uint int_args = 0;
576 uint fp_args = 0;
577 uint stk_args = 0;
578
579 for (int i = 0; i < total_args_passed; i++) {
580 switch (sig_bt[i]) {
581 case T_BOOLEAN:
582 case T_CHAR:
583 case T_BYTE:
584 case T_SHORT:
585 case T_INT:
586 if (int_args < Argument::n_int_register_parameters_j) {
587 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
588 } else {
589 stk_args = align_up(stk_args, 2);
590 regs[i].set1(VMRegImpl::stack2reg(stk_args));
591 stk_args += 1;
592 }
593 break;
594 case T_VOID:
595 // halves of T_LONG or T_DOUBLE
596 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
597 regs[i].set_bad();
598 break;
599 case T_LONG:
600 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
601 // fall through
602 case T_OBJECT:
603 case T_ARRAY:
604 case T_ADDRESS:
605 if (int_args < Argument::n_int_register_parameters_j) {
606 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
607 } else {
608 stk_args = align_up(stk_args, 2);
609 regs[i].set2(VMRegImpl::stack2reg(stk_args));
610 stk_args += 2;
611 }
612 break;
613 case T_FLOAT:
614 if (fp_args < Argument::n_float_register_parameters_j) {
615 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
616 } else {
617 stk_args = align_up(stk_args, 2);
618 regs[i].set1(VMRegImpl::stack2reg(stk_args));
619 stk_args += 1;
620 }
621 break;
622 case T_DOUBLE:
623 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
624 if (fp_args < Argument::n_float_register_parameters_j) {
625 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
626 } else {
627 stk_args = align_up(stk_args, 2);
628 regs[i].set2(VMRegImpl::stack2reg(stk_args));
629 stk_args += 2;
630 }
631 break;
632 default:
633 ShouldNotReachHere();
634 break;
635 }
636 }
637
638 return stk_args;
639 }
640
// Patch the callers callsite with entry to compiled code if it exists.
// Checks Method::code() in rbx; when compiled code exists, saves the full CPU
// state, calls SharedRuntime::fixup_callers_callsite(Method*, return_address)
// to rewrite the caller's call instruction, then restores state. No-op when
// the method has no compiled code.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // rbx holds the Method*; nothing to do if it has no compiled entry.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);   // target Method*
  __ mov(c_rarg1, rax);   // caller's return address (the callsite to patch)
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
682
// Generates the compiled-to-interpreter (c2i) adapter: copies arguments from
// their compiled-convention locations (registers and compiled stack slots,
// described by regs[]) into the interpreter's expected stack layout below the
// return address, then jumps to the method's interpreter entry.
// rbx holds the target Method* on entry; r13 is set to the sender SP.
static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    // Make room for the interpreter's argument area.
    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // T_VOID entries are the second halves of longs/doubles; no data to copy.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However to make things extra confusing. Because we can fit a long/double in
    // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less ) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  // All args are in place; dispatch to the interpreter entry point.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
832
// Generate an interpreted-to-compiled (i2c) adapter.
//
// On entry:
//   rbx - callee Method*
//   r13 - sender SP (points at the interpreter-laid-out arguments)
//   rsp - points at the return address into the caller
//
// Each argument is moved from the interpreter's stack layout (one
// Interpreter::stackElementSize slot per value; longs/doubles occupy two
// slots) into the location required by the compiled calling convention
// described by sig_bt[]/regs[], after which control jumps to the callee's
// from_compiled entry.
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    // Clear the alternate target so it redirects only this one call.
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    // regs[i] describes where the compiled convention wants this argument:
    // a stack slot, a general-purpose register, or an XMM register.
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM register argument: a float uses one interpreter slot, a double two.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs returning Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1007
1008 // ---------------------------------------------------------------
// Generate both adapter flavors for one compiled-code signature and record
// their entry points in entry_address[]:
//   I2C                 - interpreted caller -> compiled callee shuffle
//   C2I_Unverified      - c2i entry that still performs the inline-cache check
//   C2I                 - c2i entry after the receiver/IC check
//   C2I_No_Clinit_Check - c2i entry that skips the class-initialization barrier
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                            int total_args_passed,
                                            int comp_args_on_stack,
                                            const BasicType *sig_bt,
                                            const VMRegPair *regs,
                                            address entry_address[AdapterBlob::ENTRY_COUNT]) {
  entry_address[AdapterBlob::I2C] = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    // Inline-cache check: on a hit we fall through; on a miss we jump to the
    // ic-miss stub so the call site can be re-resolved.
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  entry_address[AdapterBlob::C2I] = __ pc();

  // Class initialization barrier for static methods
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  Label L_skip_barrier;
  Register method = rbx;

  // Bypass the barrier for non-static methods
  Register flags = rscratch1;
  __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
  __ testl(flags, JVM_ACC_STATIC);
  __ jcc(Assembler::zero, L_skip_barrier); // non-static

  Register klass = rscratch1;
  __ load_method_holder(klass, method);
  __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

  __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

  __ bind(L_skip_barrier);
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();

  // Let the GC's barrier-set assembler emit any barrier it requires at c2i entry.
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
  return;
}
1075
1076 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1077 VMRegPair *regs,
1078 int total_args_passed) {
1079
1080 // We return the amount of VMRegImpl stack slots we need to reserve for all
1081 // the arguments NOT counting out_preserve_stack_slots.
1082
1083 // NOTE: These arrays will have to change when c1 is ported
1084 #ifdef _WIN64
1085 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1086 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1087 };
1088 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1089 c_farg0, c_farg1, c_farg2, c_farg3
1090 };
1091 #else
1092 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1093 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1094 };
1095 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1096 c_farg0, c_farg1, c_farg2, c_farg3,
1097 c_farg4, c_farg5, c_farg6, c_farg7
1098 };
1099 #endif // _WIN64
1100
1101
1102 uint int_args = 0;
1103 uint fp_args = 0;
1104 uint stk_args = 0; // inc by 2 each time
1105
1106 for (int i = 0; i < total_args_passed; i++) {
1107 switch (sig_bt[i]) {
1108 case T_BOOLEAN:
1109 case T_CHAR:
1110 case T_BYTE:
1111 case T_SHORT:
1112 case T_INT:
1113 if (int_args < Argument::n_int_register_parameters_c) {
1114 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116 fp_args++;
1117 // Allocate slots for callee to stuff register args the stack.
1118 stk_args += 2;
1119 #endif
1120 } else {
1121 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1122 stk_args += 2;
1123 }
1124 break;
1125 case T_LONG:
1126 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1127 // fall through
1128 case T_OBJECT:
1129 case T_ARRAY:
1130 case T_ADDRESS:
1131 case T_METADATA:
1132 if (int_args < Argument::n_int_register_parameters_c) {
1133 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1134 #ifdef _WIN64
1135 fp_args++;
1136 stk_args += 2;
1137 #endif
1138 } else {
1139 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140 stk_args += 2;
1141 }
1142 break;
1143 case T_FLOAT:
1144 if (fp_args < Argument::n_float_register_parameters_c) {
1145 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1146 #ifdef _WIN64
1147 int_args++;
1148 // Allocate slots for callee to stuff register args the stack.
1149 stk_args += 2;
1150 #endif
1151 } else {
1152 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1153 stk_args += 2;
1154 }
1155 break;
1156 case T_DOUBLE:
1157 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1158 if (fp_args < Argument::n_float_register_parameters_c) {
1159 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1160 #ifdef _WIN64
1161 int_args++;
1162 // Allocate slots for callee to stuff register args the stack.
1163 stk_args += 2;
1164 #endif
1165 } else {
1166 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1167 stk_args += 2;
1168 }
1169 break;
1170 case T_VOID: // Halves of longs and doubles
1171 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1172 regs[i].set_bad();
1173 break;
1174 default:
1175 ShouldNotReachHere();
1176 break;
1177 }
1178 }
1179 #ifdef _WIN64
1180 // windows abi requires that we always allocate enough stack space
1181 // for 4 64bit registers to be stored down.
1182 if (stk_args < 8) {
1183 stk_args = 8;
1184 }
1185 #endif // _WIN64
1186
1187 return stk_args;
1188 }
1189
1190 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1191 uint num_bits,
1192 uint total_args_passed) {
1193 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1194 "only certain vector sizes are supported for now");
1195
1196 static const XMMRegister VEC_ArgReg[32] = {
1197 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1198 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1199 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1200 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1201 };
1202
1203 uint stk_args = 0;
1204 uint fp_args = 0;
1205
1206 for (uint i = 0; i < total_args_passed; i++) {
1207 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1208 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1209 regs[i].set_pair(vmreg->next(next_val), vmreg);
1210 }
1211
1212 return stk_args;
1213 }
1214
1215 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1216 // We always ignore the frame_slots arg and just use the space just below frame pointer
1217 // which by this time is free to use
1218 switch (ret_type) {
1219 case T_FLOAT:
1220 __ movflt(Address(rbp, -wordSize), xmm0);
1221 break;
1222 case T_DOUBLE:
1223 __ movdbl(Address(rbp, -wordSize), xmm0);
1224 break;
1225 case T_VOID: break;
1226 default: {
1227 __ movptr(Address(rbp, -wordSize), rax);
1228 }
1229 }
1230 }
1231
1232 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1233 // We always ignore the frame_slots arg and just use the space just below frame pointer
1234 // which by this time is free to use
1235 switch (ret_type) {
1236 case T_FLOAT:
1237 __ movflt(xmm0, Address(rbp, -wordSize));
1238 break;
1239 case T_DOUBLE:
1240 __ movdbl(xmm0, Address(rbp, -wordSize));
1241 break;
1242 case T_VOID: break;
1243 default: {
1244 __ movptr(rax, Address(rbp, -wordSize));
1245 }
1246 }
1247 }
1248
1249 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1250 for ( int i = first_arg ; i < arg_count ; i++ ) {
1251 if (args[i].first()->is_Register()) {
1252 __ push(args[i].first()->as_Register());
1253 } else if (args[i].first()->is_XMMRegister()) {
1254 __ subptr(rsp, 2*wordSize);
1255 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1256 }
1257 }
1258 }
1259
1260 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1261 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1262 if (args[i].first()->is_Register()) {
1263 __ pop(args[i].first()->as_Register());
1264 } else if (args[i].first()->is_XMMRegister()) {
1265 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1266 __ addptr(rsp, 2*wordSize);
1267 }
1268 }
1269 }
1270
1271 static void verify_oop_args(MacroAssembler* masm,
1272 const methodHandle& method,
1273 const BasicType* sig_bt,
1274 const VMRegPair* regs) {
1275 Register temp_reg = rbx; // not part of any compiled calling seq
1276 if (VerifyOops) {
1277 for (int i = 0; i < method->size_of_parameters(); i++) {
1278 if (is_reference_type(sig_bt[i])) {
1279 VMReg r = regs[i].first();
1280 assert(r->is_valid(), "bad oop arg");
1281 if (r->is_stack()) {
1282 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1283 __ verify_oop(temp_reg);
1284 } else {
1285 __ verify_oop(r->as_Register());
1286 }
1287 }
1288 }
1289 }
1290 }
1291
// Debug-only check that the calling convention delivered an enterSpecial
// argument in the fixed register the generated code expects; `name` is
// used only in the assert messages.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1300
1301
1302 //---------------------------- continuation_enter_setup ---------------------------
1303 //
1304 // Arguments:
1305 // None.
1306 //
1307 // Results:
1308 // rsp: pointer to blank ContinuationEntry
1309 //
1310 // Kills:
1311 // rax
1312 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  // The ContinuationEntry lives directly on the stack, so its size and the
  // oop-holding field offsets must be stack-slot aligned.
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Carve out space for the (still blank) entry below the current frame.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // Frame size for the OopMap: the entry itself plus one word for the
  // return address.
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new entry into the thread's chain of continuation entries:
  // entry->parent = thread->cont_entry; thread->cont_entry = entry (== rsp).
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1330
1331 //---------------------------- fill_continuation_entry ---------------------------
1332 //
1333 // Arguments:
1334 // rsp: pointer to blank Continuation entry
1335 // reg_cont_obj: pointer to the continuation
1336 // reg_flags: flags
1337 //
1338 // Results:
1339 // rsp: pointer to filled out ContinuationEntry
1340 //
1341 // Kills:
1342 // rax
1343 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug cookie so stack walkers can sanity-check that this frame really
  // is a ContinuationEntry.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Store the continuation oop and the flags; zero the remaining fields.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the caller's cont_fastpath value into the entry, then clear the
  // thread-local copy for the duration of this continuation.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1360
1361 //---------------------------- continuation_enter_cleanup ---------------------------
1362 //
1363 // Arguments:
1364 // rsp: pointer to the ContinuationEntry
1365 //
1366 // Results:
1367 // rsp: pointer to the spilled rbp in the entry frame
1368 //
1369 // Kills:
1370 // rbx
1371 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // rsp must point exactly at the thread's current ContinuationEntry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the parent's cont_fastpath value saved by fill_continuation_entry.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  // Unlink this entry from the thread's chain and release its stack space.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1386
// Generate the body of the Continuation.enterSpecial intrinsic:
//   enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
// Two entry points are emitted: an interpreted entry (used only in
// interp_only_mode, preceded by an ad-hoc i2c shuffle) and the normal
// compiled entry. Both build a ContinuationEntry frame and then either
// call the thaw stub (isContinue) or resolve and call Continuation.enter.
// The various code offsets (exception handler, frame-complete point, both
// entries) are returned through the reference out-parameters.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj = 0;
  int pos_is_cont = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj = c_rarg1;
  Register reg_is_cont = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  // Record where the thaw call lives so it can be located later.
  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  // Pop the ContinuationEntry frame before dispatching to the handler.
  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1559
// Generate the body of the continuation-yield intrinsic: build a minimal
// two-slot frame, call Continuation::freeze_entry(), then either return
// into the continuation entry frame (freeze succeeded) or return to the
// caller (continuation pinned, possibly with a pending exception).
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in 32-bit stack slots.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // A non-zero result from freeze means the continuation was pinned.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: cut the stack back to the ContinuationEntry and
  // return into the entry frame.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1621
// Public wrapper so code outside this file can emit the same
// ContinuationEntry cleanup sequence as the file-local helper above.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1625
// Emit the compiled entry for a method-handle intrinsic (invokeBasic,
// linkTo* via ref_kind, or linkToNative): locate the receiver and/or the
// trailing MemberName/NativeEntryPoint argument and hand control to
// MethodHandles::generate_method_handle_dispatch().
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool has_receiver = false;
  Register receiver_reg = noreg;
  int member_arg_pos = -1;
  Register member_reg = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Stack-passed member: load it (+ wordSize skips the return address).
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note: This assumes that compiled calling conventions always
      // pass the receiver oop in a register. If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      // NOTE(review): the two statements below are unreachable after the
      // fatal() above; they document how a stack-passed receiver would be
      // loaded if a platform ever needed it.
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1687
1688 // ---------------------------------------------------------------------------
1689 // Generate a native wrapper for a given method. The method takes arguments
1690 // in the Java compiled code convention, marshals them to the native
1691 // convention (handlizes oops, etc), transitions to native, makes the call,
1692 // returns to java state (possibly blocking), unhandlizes any result and
1693 // returns.
1694 //
1695 // Critical native functions are a shorthand for the use of
1696 // GetPrimtiveArrayCritical and disallow the use of any other JNI
1697 // functions. The wrapper is expected to unpack the arguments before
1698 // passing them to the callee. Critical native functions leave the state _in_Java,
1699 // since they cannot stop for GC.
1700 // Some other parts of JNI setup are skipped like the tear down of the JNI handle
1701 // block and the check for pending exceptions it's impossible for them
1702 // to be thrown.
1703 //
// Entry point for generating the JNI native wrapper.  Also handles the
// special cases of Continuation enter/yield intrinsics and method-handle
// linker intrinsics, which get hand-built stubs instead of the full JNI
// marshalling sequence.  Returns the freshly allocated nmethod, or null if
// nmethod allocation failed.
//
//   masm       - assembler positioned at the start of the code buffer
//   method     - the native method (or intrinsic) being wrapped
//   compile_id - id recorded in the resulting nmethod
//   in_sig_bt  - basic types of the incoming Java arguments
//   in_regs    - incoming argument locations (Java calling convention)
//   ret_type   - basic type of the native result
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  // Special case 1: Continuation.enter/yield get dedicated stubs.  The gen_*
  // helpers fill in the offsets below; the ASSERT block checks that the set
  // expected for each intrinsic was actually produced.
  if (method->is_continuation_native_intrinsic()) {
    int exception_offset = -1;
    OopMapSet* oop_maps = new OopMapSet();
    int frame_complete = -1;
    int stack_slots = -1;
    int interpreted_entry_offset = -1;
    int vep_offset = -1;
    if (method->is_continuation_enter_intrinsic()) {
      gen_continuation_enter(masm,
                             in_regs,
                             exception_offset,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             interpreted_entry_offset,
                             vep_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      gen_continuation_yield(masm,
                             in_regs,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             vep_offset);
    } else {
      guarantee(false, "Unknown Continuation native intrinsic");
    }

#ifdef ASSERT
    if (method->is_continuation_enter_intrinsic()) {
      assert(interpreted_entry_offset != -1, "Must be set");
      assert(exception_offset != -1,         "Must be set");
    } else {
      assert(interpreted_entry_offset == -1, "Must be unset");
      assert(exception_offset == -1,         "Must be unset");
    }
    assert(frame_complete != -1,    "Must be set");
    assert(stack_slots != -1,       "Must be set");
    assert(vep_offset != -1,        "Must be set");
#endif

    __ flush();
    // No oop/lock offsets apply to these stubs, hence in_ByteSize(-1).
    nmethod* nm = nmethod::new_native_nmethod(method,
                                              compile_id,
                                              masm->code(),
                                              vep_offset,
                                              frame_complete,
                                              stack_slots,
                                              in_ByteSize(-1),
                                              in_ByteSize(-1),
                                              oop_maps,
                                              exception_offset);
    if (nm == nullptr) return nm;
    if (method->is_continuation_enter_intrinsic()) {
      ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      _cont_doYield_stub = nm;
    }
    return nm;
  }

  // Special case 2: method-handle linker intrinsics dispatch straight to the
  // target; no frame is built (frame_complete is marked "not complete").
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }
  // Regular JNI native wrapper from here on.
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java arg are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method)

  const int total_in_args = method->size_of_parameters();
  // +1 for JNIEnv*, +1 more for the class mirror if static.
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;  // JNIEnv*
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT; // class mirror handle
  }

  for (int i = 0; i < total_in_args ; i++ ) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper.  We need to handlize all oops in
  // incoming registers

  // Calculate the total number of stack slots we will need.

  // First count the abi requirement plus all of the outgoing args
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;

  // Now the space for the inbound oop handle area
  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers

  int oop_handle_offset = stack_slots;
  stack_slots += total_save_slots;

  // Now any space we need for handlizing a klass if static method

  int klass_slot_offset = 0;
  int klass_offset = -1;
  int lock_slot_offset = 0;
  bool is_static = false;

  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }

  // Plus a lock if needed

  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }

  // Now a place (+2) to save return values or temp during shuffling
  // + 4 for return address (which we own) and saved rbp
  stack_slots += 6;

  // Ok The space we have allocated will look like:
  //
  //
  // FP-> |                     |
  //      |---------------------|
  //      | 2 slots for moves   |
  //      |---------------------|
  //      | lock box (if sync)  |
  //      |---------------------| <- lock_slot_offset
  //      | klass (if static)   |
  //      |---------------------| <- klass_slot_offset
  //      | oopHandle area      |
  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  //      | outbound memory     |
  //      | based arguments     |
  //      |                     |
  //      |---------------------|
  //      |                     |
  // SP-> | out_preserved_slots |
  //
  //


  // Now compute actual number of stack words we need rounding to make
  // stack properly aligned.
  stack_slots = align_up(stack_slots, StackAlignmentInSlots);

  int stack_size = stack_slots * VMRegImpl::stack_slot_size;

  // First thing make an ic check to see if we should even be here

  // We are free to use all registers as temps without saving them and
  // restoring them except rbp. rbp is the only callee save register
  // as far as the interpreter and the compiler(s) are concerned.

  const Register receiver = j_rarg0;

  Label exception_pending;

  assert_different_registers(receiver, rscratch1, rscratch2);
  __ verify_oop(receiver);
  __ ic_check(8 /* end_alignment */);

  int vep_offset = ((intptr_t)__ pc()) - start;

  if (method->needs_clinit_barrier()) {
    // Ensure holder class is initialized before running a static native.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register klass = r10;
    __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

#ifdef COMPILER1
  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */


  // We use r14 as the oop handle for the receiver/klass
  // It is callee save so it survives the call to native

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However the because of the jni_env argument the c calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do
  // All inbound args are referenced based on rbp and all outbound args via rsp.


#ifdef ASSERT
  // Debug-only tracking: once an outgoing register is written we must not
  // read it as an incoming argument again.
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }

#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
  GrowableArray<int> arg_order(2 * total_in_args);

  // Build (java-index, c-index) pairs in reverse order; see "Grand Shuffle"
  // comment above for why backwards avoids conflicts.
  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        // Oops are handlized (stored into the oop-handle area and passed by
        // address); the receiver's slot is remembered for the nmethod.
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
          break;

      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      // NOTE(review): no break after the assert below, so a product build
      // would fall through to the default move — presumably unreachable
      // since T_ADDRESS never appears in a Java signature.
      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14.  Used both by locking code and
  // the normal JNI call code.
  // point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    //  load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror it's known not-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out

  Label native_return;
  if (method->is_object_wait0()) {
    // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
    // The matching add_gc_map for this path happens after the wait returns, below.
    __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
  } else {
    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);

    __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
  }

  // We have all of the arguments setup at this point. We must not touch any register
  // argument registers at this point (what if we save/restore them there are no oop?

  if (DTraceMethodProbes) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)

  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box

    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally just about ready to make the JNI call

  // get JNIEnv* which is first argument to native
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);      break;
  case T_BYTE   : __ sign_extend_byte (rax); break;
  case T_SHORT  : __ sign_extend_short(rax); break;
  case T_INT    : /* nothing to do */        break;
  case T_DOUBLE :
  case T_FLOAT  :
    // Result is in xmm0 we'll save as needed
    break;
  case T_ARRAY:                 // Really a handle
  case T_OBJECT:                // Really a handle
      break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default       : ShouldNotReachHere();
  }

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  //     VM thread changes sync state to synchronizing and suspends threads for GC.
  //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization is progress, and escapes.
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  if (!UseSystemMemoryBarrier) {
    __ membar(Assembler::Membar_mask_bits(
              Assembler::LoadLoad | Assembler::LoadStore |
              Assembler::StoreLoad | Assembler::StoreStore));
  }

  // check for safepoint operation in progress and/or pending suspend requests
  {
    Label Continue;
    Label slow_path;

    __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);

    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(slow_path);

    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here preventing us from clearing _last_native_pc down below.
    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
    //
    __ vzeroupper();
    save_native_result(masm, ret_type, stack_slots);
    __ mov(c_rarg0, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
    // Restore any method result value
    restore_native_result(masm, ret_type, stack_slots);
    __ bind(Continue);
  }

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);

  if (method->is_object_wait0()) {
    // Check preemption for Object.wait()
    // If an alternate return pc was stashed by the preemption machinery,
    // clear it and jump there instead of falling through.
    __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
    __ cmpptr(rscratch1, NULL_WORD);
    __ jccb(Assembler::equal, native_return);
    __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
    __ jmp(rscratch1);
    __ bind(native_return);

    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);
  }


  Label reguard;
  Label reguard_done;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, reguard);
  __ bind(reguard_done);

  // native result if any is live

  // Unlock
  Label slow_path_unlock;
  Label unlock_done;
  if (method->is_synchronized()) {

    Label fast_done;

    // Get locked oop from the handle we passed to jni
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    // Must save rax if it is live now because cmpxchg must use it
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);

    // slow path re-enters here
    __ bind(unlock_done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      restore_native_result(masm, ret_type, stack_slots);
    }

    __ bind(fast_done);
  }
  if (DTraceMethodProbes) {
    save_native_result(masm, ret_type, stack_slots);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
         r15_thread, c_rarg1);
    restore_native_result(masm, ret_type, stack_slots);
  }

  __ reset_last_Java_frame(false);

  // Unbox oop result, e.g. JNIHandles::resolve value.
  if (is_reference_type(ret_type)) {
    __ resolve_jobject(rax /* value */,
                       rcx /* tmp */);
  }

  if (CheckJNICalls) {
    // clear_pending_jni_exception_check
    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
  }

  // reset handle block
  __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
  __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);

  // pop our frame

  __ leave();

#if INCLUDE_JFR
  // We need to do a poll test after unwind in case the sampler
  // managed to sample the native frame after returning to Java.
  Label L_return;
  address poll_test_pc = __ pc();
  __ relocate(relocInfo::poll_return_type);
  __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  __ jccb(Assembler::zero, L_return);
  // Record the pc of the poll so the handler can identify this site, then
  // jump to the shared polling-page return stub.
  __ lea(rscratch1, InternalAddress(poll_test_pc));
  __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
  assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
         "polling page return stub not created yet");
  address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
  __ jump(RuntimeAddress(stub));
  __ bind(L_return);
#endif // INCLUDE_JFR

  // Any exception pending?
  __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
  __ jcc(Assembler::notEqual, exception_pending);

  // Return

  __ ret(0);

  // Unexpected paths are out of line and go here

  // forward the exception
  __ bind(exception_pending);

  // and forward the exception
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // Slow path locking & unlocking
  if (method->is_synchronized()) {

    // BEGIN Slow path lock
    __ bind(slow_path_lock);

    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
    // args are (oop obj, BasicLock* lock, JavaThread* thread)

    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg1, lock_reg);
    __ mov(c_rarg2, r15_thread);

    // Not a leaf but we have last_Java_frame setup as we want.
    // We don't want to unmount in case of contention since that would complicate preserving
    // the arguments that had already been marshalled into the native convention. So we force
    // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
    // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
    __ push_cont_fastpath();
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
    __ pop_cont_fastpath();
    restore_args(masm, total_c_args, c_arg, out_regs);

#ifdef ASSERT
    { Label L;
    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("no pending exception allowed on exit from monitorenter");
    __ bind(L);
    }
#endif
    __ jmp(lock_done);

    // END Slow path lock

    // BEGIN Slow path unlock
    __ bind(slow_path_unlock);

    // If we haven't already saved the native result we must save it now as xmm registers
    // are still exposed.
    __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg2, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI

    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
    // NOTE that obj_reg == rbx currently
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);

    // args are (oop obj, BasicLock* lock, JavaThread* thread)
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */

    // Restore the pending exception that was stashed in rbx above.
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);

    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);

    // END Slow path unlock

  } // synchronized

  // SLOW PATH Reguard the stack if needed

  __ bind(reguard);
  __ vzeroupper();
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  // and continue
  __ jmp(reguard_done);



  __ flush();

  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);

  return nm;
}
2462
2463 // this function returns the adjust size (in number of words) to a c2i adapter
2464 // activation for use during deoptimization
2465 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2466 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2467 }
2468
2469
2470 uint SharedRuntime::out_preserve_stack_slots() {
2471 return 0;
2472 }
2473
2474
2475 // Number of stack slots between incoming argument block and the start of
2476 // a new frame. The PROLOG must add this many slots to the stack. The
2477 // EPILOG must remove this many slots. amd64 needs two slots for
2478 // return address.
2479 uint SharedRuntime::in_preserve_stack_slots() {
2480 return 4 + 2 * VerifyStackAtCalls;
2481 }
2482
2483 VMReg SharedRuntime::thread_register() {
2484 return r15_thread->as_VMReg();
2485 }
2486
2487 //------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools.
  // Pad the code buffer for configurations whose register save/restore
  // sequences are larger than the baseline.
  int pad = 0;
  if (UseAVX > 2) {
    pad += 1024; // wide (ZMM) vector save/restore needs extra room
  }
  if (UseAPX) {
    pad += 1024; // extended GPR (egpr) save/restore needs extra room
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
  // Fast path: reuse a previously AOT-cached deopt blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
  if (blob != nullptr) {
    _deopt_blob = blob->as_deoptimization_blob();
    return;
  }

  CodeBuffer buffer(name, 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod. A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  // rax: exception oop
  // rbx: exception handler
  // rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt. We save the argument return
  // registers. We call the first C routine, fetch_unroll_info(). This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state. We pop the current frame, then
  // push all the new frames. Then we call the C routine unpack_frames() to
  // populate these frames. Finally unpack_frames() returns us the new target
  // address. Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for non exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  // Entry point for re-execution deopts (record offset for blob creation).
  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    // JVMCI entry for uncommon traps raised at an implicit exception:
    // re-push the pc stashed by the signal handler, then fall into the
    // regular uncommon-trap entry below.
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    // Pass the pending deoptimization request (trap request) as arg1 and
    // mark it consumed by storing -1 back into the thread.
    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    // uncommon_trap() already produced the UnrollBlock*; skip the
    // fetch_unroll_info call made by the common path.
    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // all registers are dead at this entry point, except for rax, and
  // rdx which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // load throwing pc from JavaThread and patch it as the return address
  // of the current frame. Then clear the field in JavaThread

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  // All three entry points (deopt, reexecute, exception) meet here.
  __ bind(cont);

  // Call C code. Need thread and this frame, but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  // Re-read the actual unpack kind from the UnrollBlock (it may differ from
  // the mode we entered with, e.g. when an exception became pending).
  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
   Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers.  Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop, youngest-to-oldest from the
  // frame_sizes/frame_pcs arrays. Each iteration builds one skeletal
  // interpreter frame; unpack_frames() fills them in later.
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0));      // Load frame size
  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new ebp
  __ subptr(rsp, rbx);                  // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
  __ decrementl(rdx);                   // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));          // Save final return address

  // Re-push self-frame
  __ enter();                           // Save old & set new ebp

  // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two less words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave();                           // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
}
2865
2866 //------------------------------generate_handler_blob------
2867 //
2868 // Generate a special Compile2Runtime blob that saves all registers,
2869 // and setup oopmap.
2870 //
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code.  Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: we arrived via a normal return (return-handler stub), so the
  // return address on the stack is already valid. Otherwise we were stopped at
  // a poll instruction by the signal handler.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is b/w [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     there by adding additional byte to instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // A REX2 prefix is two bytes; skip both if present.
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    // A single-byte REX.B prefix (0x41) precedes polls against r8-r15.
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3050
3051 //
3052 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3053 //
3054 // Generate a stub that calls into vm to find out the proper destination
3055 // of a java call. All the argument registers are live at this point
3056 // but since this is generic code we don't know what they are and the caller
3057 // must do any gc of the args.
3058 //
RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
  assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
  assert(is_resolve_id(id), "expected a resolve stub id");

  const char* name = SharedRuntime::stub_name(id);
  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  // allocate space for the code
  ResourceMark rm;
  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_metadata(rbx);
  // Stash the resolved Method* and entry address into the register save
  // area so restore_live_registers() reloads them into rbx/rax.
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
  RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return rs_blob;
}
3144
3145 // Continuation point for throwing of implicit exceptions that are
3146 // not handled in the current activation. Fabricates an exception
3147 // oop and initiates normal exception dispatching in this
3148 // frame. Since we need to preserve callee-saved values (currently
3149 // only for C2, but done for C1 as well) we need a callee-saved oop
3150 // map and therefore have to make these stubs into RuntimeStubs
3151 // rather than BufferBlobs. If the compiler needs all registers to
3152 // be preserved between the fault point and the exception handler
3153 // then it must assume responsibility for that in
3154 // AbstractCompiler::continuation_for_implicit_null_exception or
3155 // continuation_for_implicit_division_by_zero_exception. All other
3156 // implicit exceptions (e.g., NullPointerException or
3157 // AbstractMethodError on entry) are either at call sites or
3158 // otherwise assume that stack unwinding will be initiated, so
3159 // caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // Layout is expressed in 32-bit VMReg slots; two slots per 64-bit word.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size  = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

  // Call runtime
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  // The runtime entry must have installed an exception; verify that.
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3250
3251 //------------------------------Montgomery multiplication------------------------
3252 //
3253
3254 #ifndef _WINDOWS
3255
// Subtract 0:b from carry:a. Return carry.
// a and b are len-word little-endian multi-precision integers; carry is the
// extra top word of a. The difference is written back into a[], and the
// resulting top word (carry minus the final borrow) is returned.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // clc clears the borrow, then each iteration does a[i] -= b[i] - borrow
  // via sbb. inc/dec do not touch the carry flag, so the borrow chain is
  // preserved across loop control. The final sbb folds the last borrow
  // into the incoming carry word.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3273
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// Uses the x86 widening MUL (rdx:rax = rax * operand), then ripples the
// 128-bit product into the triple-precision accumulator with add/adc.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3283
// As above, but add twice the double-length result into the
// accumulator.
// The 128-bit product is added in twice (used for the a[j]*a[i-j] terms of
// Montgomery squaring, which appear symmetrically and so count double).
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3294
3295 #else //_WINDOWS
3296
3297 static julong
3298 sub(julong a[], julong b[], julong carry, long len) {
3299 long i;
3300 julong tmp;
3301 unsigned char c = 1;
3302 for (i = 0; i < len; i++) {
3303 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3304 a[i] = tmp;
3305 }
3306 c = _addcarry_u64(c, carry, ~0, &tmp);
3307 return tmp;
3308 }
3309
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// MSVC intrinsic version: _umul128 produces the 128-bit product and
// _addcarry_u64 ripples it into the triple-precision accumulator.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3320
// As above, but add twice the double-length result into the
// accumulator.
// The product is computed once and accumulated twice (for the symmetric
// cross terms of Montgomery squaring).
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3334
3335 #endif //_WINDOWS
3336
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
//
// Computes m = a * b * R^-1 (mod n), where R = 2^(64*len), interleaving
// the multiplication with the Montgomery reduction one 64-bit word at a
// time. inv must be -n^-1 mod 2^64 (checked by the first assert).

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // First phase: for each output word i, accumulate the column sums of
  // a*b and m*n, then choose m[i] so the low word of the column cancels.
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right one word for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second phase: finish the upper columns; these become the result words.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final conditional subtraction brings the result into [0, n).
  while (t0)
    t0 = sub(m, n, t0, len);
}
3377
// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
//
// Same column-by-column scheme as montgomery_multiply above, but the
// symmetric off-diagonal products a[j]*a[i-j] are computed once and
// doubled via MACC2; the diagonal term a[i/2]^2 exists only for
// even-numbered columns and is added once with a plain MACC.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // Phase 1: columns 0 .. len-1 (see montgomery_multiply for the
  // reduction step that chooses m[i]).
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;           // off-diagonal pairs: j < i-j
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);   // doubled symmetric term
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);      // diagonal term a[i/2]^2
    }
    // Remaining reduction terms have no matching a-product.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  // Phase 2: columns len .. 2*len-1, writing result words into m[].
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;   // off-diagonal pairs again
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);      // diagonal term
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final conditional subtraction, as in montgomery_multiply.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3433
3434 // Swap words in a longword.
3435 static julong swap(julong x) {
3436 return (x << 32) | (x >> 32);
3437 }
3438
3439 // Copy len longwords from s to d, word-swapping as we go. The
3440 // destination array is reversed.
3441 static void reverse_words(julong *s, julong *d, int len) {
3442 d += len;
3443 while(len-- > 0) {
3444 d--;
3445 *d = swap(*s);
3446 s++;
3447 }
3448 }
3449
// The threshold (in jints) at which squaring is advantageous; below
// it a plain Montgomery multiply of a*a is used instead.  Determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3453
// Entry point from the JVM intrinsic: converts the jint-array inputs
// to least-significant-first julong arrays on the stack, runs the
// core multiply, and converts the result back.
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");  // i.e. longwords <= 256
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays: four equal slices of the alloca'd block.
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  // reverse_words both reverses longword order and swaps the 32-bit
  // halves within each longword (presumably converting the Java-side
  // most-significant-first jint layout -- verify against the caller).
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  // Convert the result back into the caller's layout.
  reverse_words(m, (julong *)m_ints, longwords);
}
3483
// Entry point from the JVM intrinsic for m = a^2 * R^-1 mod n.  For
// small inputs the plain multiply (a*a) is faster, so the dedicated
// squaring routine is only used above MONTGOMERY_SQUARING_THRESHOLD.
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");  // i.e. longwords <= 341
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays: three equal slices of the alloca'd block
  // (only a, n, m -- no separate b operand when squaring).
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  // See SharedRuntime::montgomery_multiply for the layout conversion.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    // Below the threshold a plain multiply of a by itself is faster.
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
3515
3516 #if INCLUDE_JFR
3517
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Frame layout in 32-bit VMReg slots: this stub's frame is only the
  // saved rbp plus the return address (two 64-bit words).
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();   // set up the rbp-based frame
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame so the runtime can walk the stack,
  // then call into JFR with the current thread as the one argument.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  // Register a (register-empty) oop map at the frame-complete offset.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  // convert the slot count to words
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3564
// For c2: call to return a leased buffer.  Same frame shape and
// calling sequence as generate_jfr_write_checkpoint above, but no
// result needs to be unpacked afterwards.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Frame layout in 32-bit VMReg slots: saved rbp + return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();   // set up the rbp-based frame
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame, then call into JFR with the current
  // thread as the one argument.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  // Register a (register-empty) oop map at the frame-complete offset.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  // convert the slot count to words
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3606
3607 #endif // INCLUDE_JFR
3608
// Generates a runtime stub that saves all live registers, calls the
// Shenandoah runtime routine selected by stub_id, and restores the
// registers.  Load-reference-barrier (LRB) variants return an object
// in rax; the keepalive (SATB pre-write) variant returns nothing.
RuntimeStub* SharedRuntime::generate_shenandoah_stub(StubId stub_id) {
  assert(UseShenandoahGC, "Only generate when Shenandoah is enabled");

  const char* name = SharedRuntime::stub_name(stub_id);
  address stub_addr = nullptr;
  bool returns_obj = true;

  // Select the C++ runtime entry point for this stub id.
  switch (stub_id) {
  case StubId::shared_shenandoah_keepalive_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::write_barrier_pre);
    returns_obj = false;   // pre-write barrier produces no value
    break;
  }
  case StubId::shared_shenandoah_lrb_strong_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_strong);
    break;
  }
  case StubId::shared_shenandoah_lrb_weak_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_weak);
    break;
  }
  case StubId::shared_shenandoah_lrb_phantom_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_phantom);
    break;
  }
  case StubId::shared_shenandoah_lrb_strong_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_strong_narrow);
    break;
  }
  case StubId::shared_shenandoah_lrb_weak_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_weak_narrow);
    break;
  }
  case StubId::shared_shenandoah_lrb_phantom_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_phantom_narrow);
    break;
  }
  default:
    ShouldNotReachHere();
  }

  CodeBuffer code(name, 2048, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  int frame_size_in_words;
  OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, true);
  address frame_complete_pc = __ pc();

  address post_call_pc;

  // Call the runtime. This is what MacroAssembler::call_VM_leaf does,
  // but we also want to have exact post-call PC for oop map location.
  {
    Label L_stack_aligned, L_end;

#ifdef _WIN64
    // Windows always allocates space for its register args
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
#endif

    // Ensure 16-byte stack alignment at the call, padding with one
    // extra slot when necessary.
    __ testptr(rsp, 15);
    __ jccb(Assembler::zero, L_stack_aligned);
    __ subptr(rsp, 8);
    __ call(RuntimeAddress(stub_addr));
    post_call_pc = __ pc();
    __ addptr(rsp, 8);
    __ jmpb(L_end);
    __ bind(L_stack_aligned);
    __ call(RuntimeAddress(stub_addr));
    // NOTE(review): post_call_pc is overwritten here, so only this
    // aligned-path call site gets the oop map below; the misaligned
    // path's return pc is not covered -- confirm this is intended.
    post_call_pc = __ pc();
    __ bind(L_end);

#ifdef _WIN64
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
#endif
  }

  if (returns_obj) {
    // RegisterSaver would clobber the call result when restoring.
    // Carry the result out of this stub by overwriting saved register.
    __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  }

  OopMapSet* oop_maps = new OopMapSet();
  oop_maps->add_gc_map(post_call_pc - start, map);

  RegisterSaver::restore_live_registers(masm, true);
  __ ret(0);

  return RuntimeStub::new_runtime_stub(name,
                                       &code,
                                       frame_complete_pc - start,
                                       frame_size_in_words,
                                       oop_maps,
                                       true);
}