1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_JVMCI
69 #include "jvmci/jvmciJavaClasses.hpp"
70 #endif
71
72 #define __ masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif // PRODUCT
79
80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
81
82 class RegisterSaver {
83 // Capture info about frame layout. Layout offsets are in jint
84 // units because compiler frame slots are jints.
85 #define XSAVE_AREA_BEGIN 160
86 #define XSAVE_AREA_YMM_BEGIN 576
87 #define XSAVE_AREA_EGPRS 960
88 #define XSAVE_AREA_OPMASK_BEGIN 1088
89 #define XSAVE_AREA_ZMM_BEGIN 1152
90 #define XSAVE_AREA_UPPERBANK 1664
91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
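  // For illustration (not referenced by the code): DEF_XMM_OFFS(0) expands to
  //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
  // so each 16-byte XMM register spans four jint slots, of which only the first
  // two get names; the remaining slots are implied by the next register's offset.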
96 enum layout {
97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
99 DEF_XMM_OFFS(0),
100 DEF_XMM_OFFS(1),
101 // 2..15 are implied in range usage
102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
103 DEF_YMM_OFFS(0),
104 DEF_YMM_OFFS(1),
105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
106 r16H_off,
107 r17_off, r17H_off,
108 r18_off, r18H_off,
109 r19_off, r19H_off,
110 r20_off, r20H_off,
111 r21_off, r21H_off,
112 r22_off, r22H_off,
113 r23_off, r23H_off,
114 r24_off, r24H_off,
115 r25_off, r25H_off,
116 r26_off, r26H_off,
117 r27_off, r27H_off,
118 r28_off, r28H_off,
119 r29_off, r29H_off,
120 r30_off, r30H_off,
121 r31_off, r31H_off,
122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
123 DEF_OPMASK_OFFS(0),
124 DEF_OPMASK_OFFS(1),
125 // 2..7 are implied in range usage
126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
127 DEF_ZMM_OFFS(0),
128 DEF_ZMM_OFFS(1),
129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
130 DEF_ZMM_UPPER_OFFS(16),
131 DEF_ZMM_UPPER_OFFS(17),
132 // 18..31 are implied in range usage
133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
134 fpu_stateH_end,
135 r15_off, r15H_off,
136 r14_off, r14H_off,
137 r13_off, r13H_off,
138 r12_off, r12H_off,
139 r11_off, r11H_off,
140 r10_off, r10H_off,
141 r9_off, r9H_off,
142 r8_off, r8H_off,
143 rdi_off, rdiH_off,
144 rsi_off, rsiH_off,
145 ignore_off, ignoreH_off, // extra copy of rbp
146 rsp_off, rspH_off,
147 rbx_off, rbxH_off,
148 rdx_off, rdxH_off,
149 rcx_off, rcxH_off,
150 rax_off, raxH_off,
151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
152 align_off, alignH_off,
153 flags_off, flagsH_off,
154 // The frame sender code expects that rbp will be in the "natural" place and
155 // will override any oopMap setting for it. We must therefore force the layout
156 // so that it agrees with the frame sender code.
157 rbp_off, rbpH_off, // copy of rbp we will restore
158 return_off, returnH_off, // slot for return address
159 reg_save_size // size in compiler stack slots
160 };
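  // A rough sketch of the save frame built by save_live_registers() below,
  // derived from the enum above (low to high addresses, i.e. from the final
  // rsp towards the caller):
  //   [ arg_reg_save_area (if any)                          ]  <- rsp
  //   [ fxsave/xsave area: xmm/ymm/zmm/opmask state         ]
  //   [   (and r16..r31 when APX is in use)                 ]
  //   [ r15 .. r8, rdi, rsi, rbp copy, rsp, rbx, rdx, rcx, rax ]
  //   [ align word, flags                                   ]
  //   [ saved rbp (pushed by enter())                       ]
  //   [ return address                                      ]  <- high address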
161
162 public:
163 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
164 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
165
166 // Offsets into the register save area
167 // Used by deoptimization when it is managing result register
168 // values on its own
169
170 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
171 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
172 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
173 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; }
174 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
175 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
176
177 // During deoptimization only the result registers need to be restored,
178 // all the other values have already been extracted.
179 static void restore_result_registers(MacroAssembler* masm);
180 };
181
182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
183 int off = 0;
184 int num_xmm_regs = XMMRegister::available_xmm_registers();
185 #if COMPILER2_OR_JVMCI
186 if (save_wide_vectors && UseAVX == 0) {
187 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
188 }
189 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
190 #else
191 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
192 #endif
193
194 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
195 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
196 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
197 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
198 // CodeBlob frame size is in words.
199 int frame_size_in_words = frame_size_in_bytes / wordSize;
200 *total_frame_words = frame_size_in_words;
201
202 // Save registers, fpu state, and flags.
203 // We assume caller has already pushed the return address onto the
204 // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
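  // A sketch of the alignment arithmetic below (derived from the code, not
  // enforced beyond the existing asserts): enter() pushes rbp, making rsp
  // 16-byte aligned; pushf() then pushes 8 bytes, so the extra subq(rsp, 8)
  // restores 16-byte alignment before the 16 GPR pushes (128 bytes) and
  // push_FPU_state(), keeping rsp 16-byte aligned for the fxsave-based save.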
207
208 __ enter(); // rsp becomes 16-byte aligned here
209 __ pushf();
210 // Make sure rsp stays 16-byte aligned
211 __ subq(rsp, 8);
212 // Push CPU state in multiple of 16 bytes
213 __ save_legacy_gprs();
214 __ push_FPU_state();
215
216
  // push_FPU_state() above already handles this on EVEX enabled targets
218 if (save_wide_vectors) {
219 // Save upper half of YMM registers(0..15)
220 int base_addr = XSAVE_AREA_YMM_BEGIN;
221 for (int n = 0; n < 16; n++) {
222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
223 }
224 if (VM_Version::supports_evex()) {
225 // Save upper half of ZMM registers(0..15)
226 base_addr = XSAVE_AREA_ZMM_BEGIN;
227 for (int n = 0; n < 16; n++) {
228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
229 }
230 // Save full ZMM registers(16..num_xmm_regs)
231 base_addr = XSAVE_AREA_UPPERBANK;
232 off = 0;
233 int vector_len = Assembler::AVX_512bit;
234 for (int n = 16; n < num_xmm_regs; n++) {
235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
236 }
237 #if COMPILER2_OR_JVMCI
238 base_addr = XSAVE_AREA_OPMASK_BEGIN;
239 off = 0;
240 for(int n = 0; n < KRegister::number_of_registers; n++) {
241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
242 }
243 #endif
244 }
245 } else {
246 if (VM_Version::supports_evex()) {
247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
248 int base_addr = XSAVE_AREA_UPPERBANK;
249 off = 0;
250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
251 for (int n = 16; n < num_xmm_regs; n++) {
252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
253 }
254 #if COMPILER2_OR_JVMCI
255 base_addr = XSAVE_AREA_OPMASK_BEGIN;
256 off = 0;
257 for(int n = 0; n < KRegister::number_of_registers; n++) {
258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
259 }
260 #endif
261 }
262 }
263
264 #if COMPILER2_OR_JVMCI
265 if (UseAPX) {
266 int base_addr = XSAVE_AREA_EGPRS;
267 off = 0;
268 for (int n = 16; n < Register::number_of_registers; n++) {
269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
270 }
271 }
272 #endif
273
274 __ vzeroupper();
275 if (frame::arg_reg_save_area_bytes != 0) {
276 // Allocate argument register save area
277 __ subptr(rsp, frame::arg_reg_save_area_bytes);
278 }
279
280 // Set an oopmap for the call site. This oopmap will map all
281 // oop-registers and debug-info registers as callee-saved. This
282 // will allow deoptimization at this safepoint to find all possible
283 // debug-info recordings, as well as let GC find all oops.
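  // For example, the first entry below,
  //   map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
  // records that rax's saved value lives in stack slot rax_off of this frame,
  // which is how GC and deoptimization locate (and possibly update) it.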
284
285 OopMapSet *oop_maps = new OopMapSet();
286 OopMap* map = new OopMap(frame_size_in_slots, 0);
287
288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
289
290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
294 // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
306
307 if (UseAPX) {
308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
324 }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area.
327 off = xmm0_off;
328 int delta = xmm1_off - off;
329 for (int n = 0; n < 16; n++) {
330 XMMRegister xmm_name = as_XMMRegister(n);
331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
332 off += delta;
333 }
334 if (UseAVX > 2) {
335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
336 off = zmm16_off;
337 delta = zmm17_off - off;
338 for (int n = 16; n < num_xmm_regs; n++) {
339 XMMRegister zmm_name = as_XMMRegister(n);
340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
341 off += delta;
342 }
343 }
344
345 #if COMPILER2_OR_JVMCI
346 if (save_wide_vectors) {
347 // Save upper half of YMM registers(0..15)
348 off = ymm0_off;
349 delta = ymm1_off - ymm0_off;
350 for (int n = 0; n < 16; n++) {
351 XMMRegister ymm_name = as_XMMRegister(n);
352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
353 off += delta;
354 }
355 if (VM_Version::supports_evex()) {
356 // Save upper half of ZMM registers(0..15)
357 off = zmm0_off;
358 delta = zmm1_off - zmm0_off;
359 for (int n = 0; n < 16; n++) {
360 XMMRegister zmm_name = as_XMMRegister(n);
361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
362 off += delta;
363 }
364 }
365 }
366 #endif // COMPILER2_OR_JVMCI
367
368 // %%% These should all be a waste but we'll keep things as they were for now
369 if (true) {
370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
374 // rbp location is known implicitly by the frame sender code, needs no oopmap
375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
385 if (UseAPX) {
386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
402 }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area.
405 off = xmm0H_off;
406 delta = xmm1H_off - off;
407 for (int n = 0; n < 16; n++) {
408 XMMRegister xmm_name = as_XMMRegister(n);
409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
410 off += delta;
411 }
412 if (UseAVX > 2) {
413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
414 off = zmm16H_off;
415 delta = zmm17H_off - off;
416 for (int n = 16; n < num_xmm_regs; n++) {
417 XMMRegister zmm_name = as_XMMRegister(n);
418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
419 off += delta;
420 }
421 }
422 }
423
424 return map;
425 }
426
427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
428 int num_xmm_regs = XMMRegister::available_xmm_registers();
429 if (frame::arg_reg_save_area_bytes != 0) {
430 // Pop arg register save area
431 __ addptr(rsp, frame::arg_reg_save_area_bytes);
432 }
433
434 #if COMPILER2_OR_JVMCI
435 if (restore_wide_vectors) {
436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
438 }
439 #else
440 assert(!restore_wide_vectors, "vectors are generated only by C2");
441 #endif
442
443 __ vzeroupper();
444
  // On EVEX enabled targets everything is handled by pop_FPU_state() below
446 if (restore_wide_vectors) {
447 // Restore upper half of YMM registers (0..15)
448 int base_addr = XSAVE_AREA_YMM_BEGIN;
449 for (int n = 0; n < 16; n++) {
450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
451 }
452 if (VM_Version::supports_evex()) {
453 // Restore upper half of ZMM registers (0..15)
454 base_addr = XSAVE_AREA_ZMM_BEGIN;
455 for (int n = 0; n < 16; n++) {
456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
457 }
458 // Restore full ZMM registers(16..num_xmm_regs)
459 base_addr = XSAVE_AREA_UPPERBANK;
460 int vector_len = Assembler::AVX_512bit;
461 int off = 0;
462 for (int n = 16; n < num_xmm_regs; n++) {
463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
464 }
465 #if COMPILER2_OR_JVMCI
466 base_addr = XSAVE_AREA_OPMASK_BEGIN;
467 off = 0;
468 for (int n = 0; n < KRegister::number_of_registers; n++) {
469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
470 }
471 #endif
472 }
473 } else {
474 if (VM_Version::supports_evex()) {
475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
476 int base_addr = XSAVE_AREA_UPPERBANK;
477 int off = 0;
478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
479 for (int n = 16; n < num_xmm_regs; n++) {
480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
481 }
482 #if COMPILER2_OR_JVMCI
483 base_addr = XSAVE_AREA_OPMASK_BEGIN;
484 off = 0;
485 for (int n = 0; n < KRegister::number_of_registers; n++) {
486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
487 }
488 #endif
489 }
490 }
491
492 #if COMPILER2_OR_JVMCI
493 if (UseAPX) {
494 int base_addr = XSAVE_AREA_EGPRS;
495 int off = 0;
496 for (int n = 16; n < Register::number_of_registers; n++) {
497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
498 }
499 }
500 #endif
501
502 // Recover CPU state
503 __ pop_FPU_state();
504 __ restore_legacy_gprs();
505 __ addq(rsp, 8);
506 __ popf();
507 // Get the rbp described implicitly by the calling convention (no oopMap)
508 __ pop(rbp);
509 }
510
511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
512
  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
518
519 // Restore fp result register
520 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
521 // Restore integer result register
522 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
523 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
524
  // Pop all of the register save area off the stack except the return address
526 __ addptr(rsp, return_offset_in_bytes());
527 }
528
529 // Is vector's size (in bytes) bigger than a size saved by default?
530 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
531 bool SharedRuntime::is_wide_vector(int size) {
532 return size > 16;
533 }
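// For example, is_wide_vector(32) is true for a 32-byte YMM vector; only the
// default 16-byte XMM state is covered by fxsave/fxrstor, as noted above.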
534
535 // ---------------------------------------------------------------------------
536 // Read the array of BasicTypes from a signature, and compute where the
537 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
538 // quantities. Values less than VMRegImpl::stack0 are registers, those above
539 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
540 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
545
546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
547 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
548 // units regardless of build. Of course for i486 there is no 64 bit build
549
550 // The Java calling convention is a "shifted" version of the C ABI.
551 // By skipping the first C ABI register we can call non-static jni methods
552 // with small numbers of arguments without having to shuffle the arguments
553 // at all. Since we control the java ABI we ought to at least get some
554 // advantage out of it.
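// A schematic example (hypothetical signature, not exercised here): for a
// virtual method taking (int, long, double), the incoming sig_bt is
//   { T_OBJECT /* receiver */, T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID }
// and the loop below assigns j_rarg0, j_rarg1, j_rarg2, BAD, j_farg0, BAD,
// with stk_args == 0 since everything fits in registers.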
555
556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
557 VMRegPair *regs,
558 int total_args_passed) {
559
560 // Create the mapping between argument positions and
561 // registers.
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
564 };
565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
566 j_farg0, j_farg1, j_farg2, j_farg3,
567 j_farg4, j_farg5, j_farg6, j_farg7
568 };
569
570
571 uint int_args = 0;
572 uint fp_args = 0;
573 uint stk_args = 0;
574
575 for (int i = 0; i < total_args_passed; i++) {
576 switch (sig_bt[i]) {
577 case T_BOOLEAN:
578 case T_CHAR:
579 case T_BYTE:
580 case T_SHORT:
581 case T_INT:
582 if (int_args < Argument::n_int_register_parameters_j) {
583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
584 } else {
585 stk_args = align_up(stk_args, 2);
586 regs[i].set1(VMRegImpl::stack2reg(stk_args));
587 stk_args += 1;
588 }
589 break;
590 case T_VOID:
591 // halves of T_LONG or T_DOUBLE
592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
593 regs[i].set_bad();
594 break;
595 case T_LONG:
596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
597 // fall through
598 case T_OBJECT:
599 case T_ARRAY:
600 case T_ADDRESS:
601 if (int_args < Argument::n_int_register_parameters_j) {
602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
603 } else {
604 stk_args = align_up(stk_args, 2);
605 regs[i].set2(VMRegImpl::stack2reg(stk_args));
606 stk_args += 2;
607 }
608 break;
609 case T_FLOAT:
610 if (fp_args < Argument::n_float_register_parameters_j) {
611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
612 } else {
613 stk_args = align_up(stk_args, 2);
614 regs[i].set1(VMRegImpl::stack2reg(stk_args));
615 stk_args += 1;
616 }
617 break;
618 case T_DOUBLE:
619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
620 if (fp_args < Argument::n_float_register_parameters_j) {
621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
622 } else {
623 stk_args = align_up(stk_args, 2);
624 regs[i].set2(VMRegImpl::stack2reg(stk_args));
625 stk_args += 2;
626 }
627 break;
628 default:
629 ShouldNotReachHere();
630 break;
631 }
632 }
633
634 return stk_args;
635 }
636
// Patch the caller's callsite with the entry to compiled code if it exists.
638 static void patch_callers_callsite(MacroAssembler *masm) {
639 Label L;
640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
641 __ jcc(Assembler::equal, L);
642
643 // Save the current stack pointer
644 __ mov(r13, rsp);
645 // Schedule the branch target address early.
646 // Call into the VM to patch the caller, then jump to compiled callee
647 // rax isn't live so capture return address while we easily can
648 __ movptr(rax, Address(rsp, 0));
649
650 // align stack so push_CPU_state doesn't fault
651 __ andptr(rsp, -(StackAlignmentInBytes));
652 __ push_CPU_state();
653 __ vzeroupper();
654 // VM needs caller's callsite
655 // VM needs target method
656 // This needs to be a long call since we will relocate this adapter to
657 // the codeBuffer and it may not reach
658
659 // Allocate argument register save area
660 if (frame::arg_reg_save_area_bytes != 0) {
661 __ subptr(rsp, frame::arg_reg_save_area_bytes);
662 }
663 __ mov(c_rarg0, rbx);
664 __ mov(c_rarg1, rax);
665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
666
667 // De-allocate argument register save area
668 if (frame::arg_reg_save_area_bytes != 0) {
669 __ addptr(rsp, frame::arg_reg_save_area_bytes);
670 }
671
672 __ vzeroupper();
673 __ pop_CPU_state();
674 // restore sp
675 __ mov(rsp, r13);
676 __ bind(L);
677 }
678
679 static void gen_c2i_adapter(MacroAssembler *masm,
680 int total_args_passed,
681 int comp_args_on_stack,
682 const BasicType *sig_bt,
683 const VMRegPair *regs,
684 Label& skip_fixup) {
685 // Before we get into the guts of the C2I adapter, see if we should be here
686 // at all. We've come from compiled code and are attempting to jump to the
687 // interpreter, which means the caller made a static call to get here
688 // (vcalls always get a compiled target if there is one). Check for a
689 // compiled target. If there is one, we need to patch the caller's call.
690 patch_callers_callsite(masm);
691
692 __ bind(skip_fixup);
693
694 // Since all args are passed on the stack, total_args_passed *
695 // Interpreter::stackElementSize is the space we need.
696
697 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
698
699 int extraspace = (total_args_passed * Interpreter::stackElementSize);
700
701 // stack is aligned, keep it that way
702 // This is not currently needed or enforced by the interpreter, but
703 // we might as well conform to the ABI.
704 extraspace = align_up(extraspace, 2*wordSize);
705
706 // set senderSP value
707 __ lea(r13, Address(rsp, wordSize));
708
709 #ifdef ASSERT
710 __ check_stack_alignment(r13, "sender stack not aligned");
711 #endif
712 if (extraspace > 0) {
713 // Pop the return address
714 __ pop(rax);
715
716 __ subptr(rsp, extraspace);
717
718 // Push the return address
719 __ push(rax);
720
721 // Account for the return address location since we store it first rather
722 // than hold it in a register across all the shuffling
723 extraspace += wordSize;
724 }
725
726 #ifdef ASSERT
727 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
728 #endif
729
730 // Now write the args into the outgoing interpreter space
731 for (int i = 0; i < total_args_passed; i++) {
732 if (sig_bt[i] == T_VOID) {
733 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
734 continue;
735 }
736
737 // offset to start parameters
738 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
739 int next_off = st_off - Interpreter::stackElementSize;
740
741 // Say 4 args:
742 // i st_off
743 // 0 32 T_LONG
744 // 1 24 T_VOID
745 // 2 16 T_OBJECT
746 // 3 8 T_BOOL
747 // - 0 return address
748 //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64 bit vm and it would be silly to break them up, the
    // interpreter leaves one slot empty and only stores to a single slot. In this
    // case the slot that is occupied is the T_VOID slot. See, I said it was confusing.
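    // For the example above (and stackElementSize == wordSize == 8), the T_LONG
    // at i == 0 has st_off == 32 and next_off == 24; the 64-bit value is written
    // once at next_off, i.e. into the slot listed for its T_VOID half, while the
    // st_off slot is left unused (filled with known junk in debug builds).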
753
754 VMReg r_1 = regs[i].first();
755 VMReg r_2 = regs[i].second();
756 if (!r_1->is_valid()) {
757 assert(!r_2->is_valid(), "");
758 continue;
759 }
760 if (r_1->is_stack()) {
761 // memory to memory use rax
762 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
763 if (!r_2->is_valid()) {
764 // sign extend??
765 __ movl(rax, Address(rsp, ld_off));
766 __ movptr(Address(rsp, st_off), rax);
767
768 } else {
769
770 __ movq(rax, Address(rsp, ld_off));
771
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
773 // T_DOUBLE and T_LONG use two slots in the interpreter
774 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
775 // ld_off == LSW, ld_off+wordSize == MSW
776 // st_off == MSW, next_off == LSW
777 __ movq(Address(rsp, next_off), rax);
778 #ifdef ASSERT
779 // Overwrite the unused slot with known junk
780 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
781 __ movptr(Address(rsp, st_off), rax);
782 #endif /* ASSERT */
783 } else {
784 __ movq(Address(rsp, st_off), rax);
785 }
786 }
787 } else if (r_1->is_Register()) {
788 Register r = r_1->as_Register();
789 if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
791 // why not sign extend??
792 __ movl(Address(rsp, st_off), r);
793 } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
795 // T_DOUBLE and T_LONG use two slots in the interpreter
796 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
797 // long/double in gpr
798 #ifdef ASSERT
799 // Overwrite the unused slot with known junk
800 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
801 __ movptr(Address(rsp, st_off), rax);
802 #endif /* ASSERT */
803 __ movq(Address(rsp, next_off), r);
804 } else {
805 __ movptr(Address(rsp, st_off), r);
806 }
807 }
808 } else {
809 assert(r_1->is_XMMRegister(), "");
810 if (!r_2->is_valid()) {
        // only a float, use just part of the slot
812 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
813 } else {
814 #ifdef ASSERT
815 // Overwrite the unused slot with known junk
816 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
817 __ movptr(Address(rsp, st_off), rax);
818 #endif /* ASSERT */
819 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
820 }
821 }
822 }
823
824 // Schedule the branch target address early.
825 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
826 __ jmp(rcx);
827 }
828
829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
830 int total_args_passed,
831 int comp_args_on_stack,
832 const BasicType *sig_bt,
833 const VMRegPair *regs) {
834
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose the alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
843
844 // Adapters can be frameless because they do not require the caller
845 // to perform additional cleanup work, such as correcting the stack pointer.
846 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
847 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
848 // even if a callee has modified the stack pointer.
849 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
850 // routinely repairs its caller's stack pointer (from sender_sp, which is set
851 // up via the senderSP register).
852 // In other words, if *either* the caller or callee is interpreted, we can
853 // get the stack pointer repaired after a call.
854 // This is why c2i and i2c adapters cannot be indefinitely composed.
855 // In particular, if a c2i adapter were to somehow call an i2c adapter,
856 // both caller and callee would be compiled methods, and neither would
857 // clean up the stack pointer changes performed by the two adapters.
858 // If this happens, control eventually transfers back to the compiled
859 // caller, but with an uncorrected stack, causing delayed havoc.
860
861 // Must preserve original SP for loading incoming arguments because
862 // we need to align the outgoing SP for compiled code.
863 __ movptr(r11, rsp);
864
865 // Pick up the return address
866 __ pop(rax);
867
868 // Convert 4-byte c2 stack slots to words.
869 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
870
871 if (comp_args_on_stack) {
872 __ subptr(rsp, comp_words_on_stack * wordSize);
873 }
874
875 // Ensure compiled code always sees stack at proper alignment
876 __ andptr(rsp, -16);
877
  // Push the return address and misalign the stack so that the youngest frame
  // sees it exactly as it would immediately after a call instruction.
880 __ push(rax);
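  // At this point rsp is 8 modulo 16 (a return address on an aligned stack),
  // which is what compiled code expects to see right after a call.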
881
882 // Put saved SP in another register
883 const Register saved_sp = rax;
884 __ movptr(saved_sp, r11);
885
886 // Will jump to the compiled code just as if compiled code was doing it.
887 // Pre-load the register-jump target early, to schedule it better.
888 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
889
890 #if INCLUDE_JVMCI
891 if (EnableJVMCI) {
892 // check if this call should be routed towards a specific entry point
893 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
894 Label no_alternative_target;
895 __ jcc(Assembler::equal, no_alternative_target);
896 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
897 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
898 __ bind(no_alternative_target);
899 }
900 #endif // INCLUDE_JVMCI
901
902 // Now generate the shuffle code. Pick up all register args and move the
903 // rest through the floating point stack top.
904 for (int i = 0; i < total_args_passed; i++) {
905 if (sig_bt[i] == T_VOID) {
906 // Longs and doubles are passed in native word order, but misaligned
907 // in the 32-bit build.
908 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
909 continue;
910 }
911
912 // Pick up 0, 1 or 2 words from SP+offset.
913
914 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
915 "scrambled load targets?");
916 // Load in argument order going down.
917 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
918 // Point to interpreter value (vs. tag)
919 int next_off = ld_off - Interpreter::stackElementSize;
920 //
921 //
922 //
923 VMReg r_1 = regs[i].first();
924 VMReg r_2 = regs[i].second();
925 if (!r_1->is_valid()) {
926 assert(!r_2->is_valid(), "");
927 continue;
928 }
929 if (r_1->is_stack()) {
930 // Convert stack slot to an SP offset (+ wordSize to account for return address )
931 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
932
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss, a reasonable value of r13
      // will be generated.
936 if (!r_2->is_valid()) {
937 // sign extend???
938 __ movl(r13, Address(saved_sp, ld_off));
939 __ movptr(Address(rsp, st_off), r13);
940 } else {
941 //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW, however locals
        // are accessed as negative offsets so the LSW is at the LOW address
948
949 // ld_off is MSW so get LSW
950 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
951 next_off : ld_off;
952 __ movq(r13, Address(saved_sp, offset));
953 // st_off is LSW (i.e. reg.first())
954 __ movq(Address(rsp, st_off), r13);
955 }
956 } else if (r_1->is_Register()) { // Register argument
957 Register r = r_1->as_Register();
958 assert(r != rax, "must be different");
959 if (r_2->is_valid()) {
960 //
      // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
      // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
      // so we must adjust where to pick up the data to match the interpreter.
964
965 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
966 next_off : ld_off;
967
968 // this can be a misaligned move
969 __ movq(r, Address(saved_sp, offset));
970 } else {
971 // sign extend and use a full word?
972 __ movl(r, Address(saved_sp, ld_off));
973 }
974 } else {
975 if (!r_2->is_valid()) {
976 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
977 } else {
978 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
979 }
980 }
981 }
982
983 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
984
985 // 6243940 We might end up in handle_wrong_method if
986 // the callee is deoptimized as we race thru here. If that
987 // happens we don't want to take a safepoint because the
988 // caller frame will look interpreted and arguments are now
989 // "compiled" so it is much better to make this transition
990 // invisible to the stack walking code. Unfortunately if
991 // we try and find the callee by normal means a safepoint
992 // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.
994
995 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
996
  // put Method* where a c2i would expect it should we end up there;
  // only needed because the c2 resolve stubs return Method* as a result in
999 // rax
1000 __ mov(rax, rbx);
1001 __ jmp(r11);
1002 }
1003
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006 int total_args_passed,
1007 int comp_args_on_stack,
1008 const BasicType *sig_bt,
1009 const VMRegPair *regs,
1010 address entry_address[AdapterBlob::ENTRY_COUNT]) {
1011 entry_address[AdapterBlob::I2C] = __ pc();
1012
1013 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014
1015 // -------------------------------------------------------------------------
1016 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
1017 // to the interpreter. The args start out packed in the compiled layout. They
1018 // need to be unpacked into the interpreter layout. This will almost always
1019 // require some stack space. We grow the current (compiled) stack, then repack
1020 // the args. We finally end in a jump to the generic interpreter entry point.
1021 // On exit from the interpreter, the interpreter will restore our SP (lest the
1022 // compiled code, which relies solely on SP and not RBP, get sick).
1023
1024 entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1025 Label skip_fixup;
1026
1027 Register data = rax;
1028 Register receiver = j_rarg0;
1029 Register temp = rbx;
1030
1031 {
1032 __ ic_check(1 /* end_alignment */);
1033 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
1037 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038 __ jcc(Assembler::equal, skip_fixup);
1039 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040 }
1041
1042 entry_address[AdapterBlob::C2I] = __ pc();
1043
1044 // Class initialization barrier for static methods
1045 entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1046 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1047 Label L_skip_barrier;
1048 Register method = rbx;
1049
1050 // Bypass the barrier for non-static methods
1051 Register flags = rscratch1;
1052 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053 __ testl(flags, JVM_ACC_STATIC);
1054 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055
1056 Register klass = rscratch1;
1057 __ load_method_holder(klass, method);
1058 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1059
1060 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1061
1062 __ bind(L_skip_barrier);
1063 entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1064
1065 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1066 bs->c2i_entry_barrier(masm);
1067
1068 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1069 return;
1070 }
1071
1072 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1073 VMRegPair *regs,
1074 int total_args_passed) {
1075
1076 // We return the amount of VMRegImpl stack slots we need to reserve for all
1077 // the arguments NOT counting out_preserve_stack_slots.
1078
1079 // NOTE: These arrays will have to change when c1 is ported
1080 #ifdef _WIN64
1081 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1082 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1083 };
1084 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1085 c_farg0, c_farg1, c_farg2, c_farg3
1086 };
1087 #else
1088 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1089 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1090 };
1091 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1092 c_farg0, c_farg1, c_farg2, c_farg3,
1093 c_farg4, c_farg5, c_farg6, c_farg7
1094 };
1095 #endif // _WIN64
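  // A schematic example (hypothetical native signature): for
  //   sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT, T_INT, T_DOUBLE, T_VOID }
  // the loop below assigns c_rarg0, c_rarg1, c_rarg2 and then c_farg0 on SysV
  // but c_farg3 on Windows, because on Windows the integer and float argument
  // registers share position numbers (the counters advance together).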
1096
1097
1098 uint int_args = 0;
1099 uint fp_args = 0;
1100 uint stk_args = 0; // inc by 2 each time
1101
1102 for (int i = 0; i < total_args_passed; i++) {
1103 switch (sig_bt[i]) {
1104 case T_BOOLEAN:
1105 case T_CHAR:
1106 case T_BYTE:
1107 case T_SHORT:
1108 case T_INT:
1109 if (int_args < Argument::n_int_register_parameters_c) {
1110 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1111 #ifdef _WIN64
1112 fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1114 stk_args += 2;
1115 #endif
1116 } else {
1117 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1118 stk_args += 2;
1119 }
1120 break;
1121 case T_LONG:
1122 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1123 // fall through
1124 case T_OBJECT:
1125 case T_ARRAY:
1126 case T_ADDRESS:
1127 case T_METADATA:
1128 if (int_args < Argument::n_int_register_parameters_c) {
1129 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1130 #ifdef _WIN64
1131 fp_args++;
1132 stk_args += 2;
1133 #endif
1134 } else {
1135 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1136 stk_args += 2;
1137 }
1138 break;
1139 case T_FLOAT:
1140 if (fp_args < Argument::n_float_register_parameters_c) {
1141 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1142 #ifdef _WIN64
1143 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1145 stk_args += 2;
1146 #endif
1147 } else {
1148 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1149 stk_args += 2;
1150 }
1151 break;
1152 case T_DOUBLE:
1153 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1154 if (fp_args < Argument::n_float_register_parameters_c) {
1155 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1156 #ifdef _WIN64
1157 int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
1159 stk_args += 2;
1160 #endif
1161 } else {
1162 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1163 stk_args += 2;
1164 }
1165 break;
1166 case T_VOID: // Halves of longs and doubles
1167 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1168 regs[i].set_bad();
1169 break;
1170 default:
1171 ShouldNotReachHere();
1172 break;
1173 }
1174 }
1175 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1178 if (stk_args < 8) {
1179 stk_args = 8;
1180 }
1181 #endif // _WIN64
1182
1183 return stk_args;
1184 }
1185
1186 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1187 uint num_bits,
1188 uint total_args_passed) {
1189 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1190 "only certain vector sizes are supported for now");
1191
1192 static const XMMRegister VEC_ArgReg[32] = {
1193 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1194 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1195 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1196 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1197 };
1198
1199 uint stk_args = 0;
1200 uint fp_args = 0;
1201
1202 for (uint i = 0; i < total_args_passed; i++) {
1203 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1204 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1205 regs[i].set_pair(vmreg->next(next_val), vmreg);
1206 }
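  // For example, with num_bits == 256 each argument is passed in a full YMM
  // register: the pair (vmreg->next(7), vmreg) covers 8 consecutive 32-bit
  // VMReg slots. No stack slots are used, so stk_args is always 0.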
1207
1208 return stk_args;
1209 }
1210
1211 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1214 switch (ret_type) {
1215 case T_FLOAT:
1216 __ movflt(Address(rbp, -wordSize), xmm0);
1217 break;
1218 case T_DOUBLE:
1219 __ movdbl(Address(rbp, -wordSize), xmm0);
1220 break;
1221 case T_VOID: break;
1222 default: {
1223 __ movptr(Address(rbp, -wordSize), rax);
1224 }
1225 }
1226 }
1227
1228 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1231 switch (ret_type) {
1232 case T_FLOAT:
1233 __ movflt(xmm0, Address(rbp, -wordSize));
1234 break;
1235 case T_DOUBLE:
1236 __ movdbl(xmm0, Address(rbp, -wordSize));
1237 break;
1238 case T_VOID: break;
1239 default: {
1240 __ movptr(rax, Address(rbp, -wordSize));
1241 }
1242 }
1243 }
1244
1245 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1246 for ( int i = first_arg ; i < arg_count ; i++ ) {
1247 if (args[i].first()->is_Register()) {
1248 __ push(args[i].first()->as_Register());
1249 } else if (args[i].first()->is_XMMRegister()) {
1250 __ subptr(rsp, 2*wordSize);
1251 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1252 }
1253 }
1254 }
1255
1256 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1257 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1258 if (args[i].first()->is_Register()) {
1259 __ pop(args[i].first()->as_Register());
1260 } else if (args[i].first()->is_XMMRegister()) {
1261 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1262 __ addptr(rsp, 2*wordSize);
1263 }
1264 }
1265 }
1266
1267 static void verify_oop_args(MacroAssembler* masm,
1268 const methodHandle& method,
1269 const BasicType* sig_bt,
1270 const VMRegPair* regs) {
1271 Register temp_reg = rbx; // not part of any compiled calling seq
1272 if (VerifyOops) {
1273 for (int i = 0; i < method->size_of_parameters(); i++) {
1274 if (is_reference_type(sig_bt[i])) {
1275 VMReg r = regs[i].first();
1276 assert(r->is_valid(), "bad oop arg");
1277 if (r->is_stack()) {
1278 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1279 __ verify_oop(temp_reg);
1280 } else {
1281 __ verify_oop(r->as_Register());
1282 }
1283 }
1284 }
1285 }
1286 }
1287
1288 static void check_continuation_enter_argument(VMReg actual_vmreg,
1289 Register expected_reg,
1290 const char* name) {
1291 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1292 assert(actual_vmreg->as_Register() == expected_reg,
1293 "%s is in unexpected register: %s instead of %s",
1294 name, actual_vmreg->as_Register()->name(), expected_reg->name());
1295 }
1296
1297
1298 //---------------------------- continuation_enter_setup ---------------------------
1299 //
1300 // Arguments:
1301 // None.
1302 //
1303 // Results:
1304 // rsp: pointer to blank ContinuationEntry
1305 //
1306 // Kills:
1307 // rax
1308 //
1309 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1310 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1311 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
1312 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1313
1314 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1315 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1316
1317 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1318 OopMap* map = new OopMap(frame_size, 0);
1319
1320 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1321 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1322 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1323
1324 return map;
1325 }
1326
1327 //---------------------------- fill_continuation_entry ---------------------------
1328 //
1329 // Arguments:
1330 // rsp: pointer to blank Continuation entry
1331 // reg_cont_obj: pointer to the continuation
1332 // reg_flags: flags
1333 //
1334 // Results:
1335 // rsp: pointer to filled out ContinuationEntry
1336 //
1337 // Kills:
1338 // rax
1339 //
1340 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1341 assert_different_registers(rax, reg_cont_obj, reg_flags);
1342 #ifdef ASSERT
1343 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1344 #endif
1345 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1346 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1347 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1348 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1349 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1350
1351 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1352 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1353
1354 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1355 }
1356
1357 //---------------------------- continuation_enter_cleanup ---------------------------
1358 //
1359 // Arguments:
1360 // rsp: pointer to the ContinuationEntry
1361 //
1362 // Results:
1363 // rsp: pointer to the spilled rbp in the entry frame
1364 //
1365 // Kills:
1366 // rbx
1367 //
1368 static void continuation_enter_cleanup(MacroAssembler* masm) {
1369 #ifdef ASSERT
1370 Label L_good_sp;
1371 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1372 __ jcc(Assembler::equal, L_good_sp);
1373 __ stop("Incorrect rsp at continuation_enter_cleanup");
1374 __ bind(L_good_sp);
1375 #endif
1376 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1377 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1378 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1379 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1380 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1381 }
1382
1383 static void gen_continuation_enter(MacroAssembler* masm,
1384 const VMRegPair* regs,
1385 int& exception_offset,
1386 OopMapSet* oop_maps,
1387 int& frame_complete,
1388 int& stack_slots,
1389 int& interpreted_entry_offset,
1390 int& compiled_entry_offset) {
1391
1392 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1393 int pos_cont_obj = 0;
1394 int pos_is_cont = 1;
1395 int pos_is_virtual = 2;
1396
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1401 Register reg_cont_obj = c_rarg1;
1402 Register reg_is_cont = c_rarg2;
1403 Register reg_is_virtual = c_rarg3;
1404
1405 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1406 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1407 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1408
1409 // Utility methods kill rax, make sure there are no collisions
1410 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1411
1412 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1413 relocInfo::static_call_type);
1414
1415 address start = __ pc();
1416
1417 Label L_thaw, L_exit;
1418
1419 // i2i entry used at interp_only_mode only
1420 interpreted_entry_offset = __ pc() - start;
1421 {
1422 #ifdef ASSERT
1423 Label is_interp_only;
1424 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1425 __ jcc(Assembler::notEqual, is_interp_only);
1426 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1427 __ bind(is_interp_only);
1428 #endif
1429
1430 __ pop(rax); // return address
1431 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1432 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1433 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1434 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1435 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1436 __ push(rax); // return address
1437 __ push_cont_fastpath();
1438
1439 __ enter();
1440
1441 stack_slots = 2; // will be adjusted in setup
1442 OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That's okay because at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1445
1446 __ verify_oop(reg_cont_obj);
1447
1448 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1449
1450 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1451 __ testptr(reg_is_cont, reg_is_cont);
1452 __ jcc(Assembler::notZero, L_thaw);
1453
1454 // --- Resolve path
1455
1456 // Make sure the call is patchable
1457 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1458 // Emit stub for static call
1459 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1460 if (stub == nullptr) {
1461 fatal("CodeCache is full at gen_continuation_enter");
1462 }
1463 __ call(resolve);
1464 oop_maps->add_gc_map(__ pc() - start, map);
1465 __ post_call_nop();
1466
1467 __ jmp(L_exit);
1468 }
1469
1470 // compiled entry
1471 __ align(CodeEntryAlignment);
1472 compiled_entry_offset = __ pc() - start;
1473 __ enter();
1474
1475 stack_slots = 2; // will be adjusted in setup
1476 OopMap* map = continuation_enter_setup(masm, stack_slots);
1477
1478 // Frame is now completed as far as size and linkage.
1479 frame_complete = __ pc() - start;
1480
1481 __ verify_oop(reg_cont_obj);
1482
1483 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1484
1485 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1486 __ testptr(reg_is_cont, reg_is_cont);
1487 __ jccb(Assembler::notZero, L_thaw);
1488
1489 // --- call Continuation.enter(Continuation c, boolean isContinue)
1490
1491 // Make sure the call is patchable
1492 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1493
1494 // Emit stub for static call
1495 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1496 if (stub == nullptr) {
1497 fatal("CodeCache is full at gen_continuation_enter");
1498 }
1499
1500 // The call needs to be resolved. There's a special case for this in
1501 // SharedRuntime::find_callee_info_helper() which calls
1502 // LinkResolver::resolve_continuation_enter() which resolves the call to
1503 // Continuation.enter(Continuation c, boolean isContinue).
1504 __ call(resolve);
1505
1506 oop_maps->add_gc_map(__ pc() - start, map);
1507 __ post_call_nop();
1508
1509 __ jmpb(L_exit);
1510
1511 // --- Thawing path
1512
1513 __ bind(L_thaw);
1514
1515 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1516 __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1517
1518 ContinuationEntry::_return_pc_offset = __ pc() - start;
1519 oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1520 __ post_call_nop();
1521
1522 // --- Normal exit (resolve/thawing)
1523
1524 __ bind(L_exit);
1525 ContinuationEntry::_cleanup_offset = __ pc() - start;
1526 continuation_enter_cleanup(masm);
1527 __ pop(rbp);
1528 __ ret(0);
1529
1530 // --- Exception handling path
1531
1532 exception_offset = __ pc() - start;
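  // On entry rax holds the exception oop; the return address on the stack will become the exception pc.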
1533
1534 continuation_enter_cleanup(masm);
1535 __ pop(rbp);
1536
1537 __ movptr(c_rarg0, r15_thread);
1538 __ movptr(c_rarg1, Address(rsp, 0)); // return address
1539
1540 // rax still holds the original exception oop, save it before the call
1541 __ push(rax);
1542
1543 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1544 __ movptr(rbx, rax);
1545
1546 // Continue at exception handler:
1547 // rax: exception oop
1548 // rbx: exception handler
1549 // rdx: exception pc
1550 __ pop(rax);
1551 __ verify_oop(rax);
1552 __ pop(rdx);
1553 __ jmp(rbx);
1554 }
1555
1556 static void gen_continuation_yield(MacroAssembler* masm,
1557 const VMRegPair* regs,
1558 OopMapSet* oop_maps,
1559 int& frame_complete,
1560 int& stack_slots,
1561 int& compiled_entry_offset) {
1562 enum layout {
1563 rbp_off,
1564 rbpH_off,
1565 return_off,
1566 return_off2,
1567 framesize // inclusive of return address
1568 };
1569 stack_slots = framesize / VMRegImpl::slots_per_word;
1570 assert(stack_slots == 2, "recheck layout");
1571
1572 address start = __ pc();
1573 compiled_entry_offset = __ pc() - start;
1574 __ enter();
1575 address the_pc = __ pc();
1576
1577 frame_complete = the_pc - start;
1578
1579   // This nop must be exactly at the PC we push into the frame info.
1580   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1581   // with it right away.
1582 __ post_call_nop();
1583 OopMap* map = new OopMap(framesize, 1);
1584 oop_maps->add_gc_map(frame_complete, map);
1585
1586 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1587 __ movptr(c_rarg0, r15_thread);
1588 __ movptr(c_rarg1, rsp);
1589 __ call_VM_leaf(Continuation::freeze_entry(), 2);
1590 __ reset_last_Java_frame(true);
1591
1592 Label L_pinned;
1593
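  // freeze_entry() returns 0 on success; a non-zero result means the freeze failed (e.g. the continuation is pinned).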
1594 __ testptr(rax, rax);
1595 __ jcc(Assembler::notZero, L_pinned);
1596
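  // Freeze succeeded: switch back to the ContinuationEntry frame, clean it up, and return to its caller.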
1597 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1598 continuation_enter_cleanup(masm);
1599 __ pop(rbp);
1600 __ ret(0);
1601
1602 __ bind(L_pinned);
1603
1604 // Pinned, return to caller
1605
1606 // handle pending exception thrown by freeze
1607 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1608 Label ok;
1609 __ jcc(Assembler::equal, ok);
1610 __ leave();
1611 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1612 __ bind(ok);
1613
1614 __ leave();
1615 __ ret(0);
1616 }
1617
1618 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1619 ::continuation_enter_cleanup(masm);
1620 }
1621
1622 static void gen_special_dispatch(MacroAssembler* masm,
1623 const methodHandle& method,
1624 const BasicType* sig_bt,
1625 const VMRegPair* regs) {
1626 verify_oop_args(masm, method, sig_bt, regs);
1627 vmIntrinsics::ID iid = method->intrinsic_id();
1628
1629 // Now write the args into the outgoing interpreter space
1630 bool has_receiver = false;
1631 Register receiver_reg = noreg;
1632 int member_arg_pos = -1;
1633 Register member_reg = noreg;
1634 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
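  // A non-zero ref_kind identifies one of the linkTo* intrinsics, which take a trailing MemberName argument.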
1635 if (ref_kind != 0) {
1636 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1637 member_reg = rbx; // known to be free at this point
1638 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1639 } else if (iid == vmIntrinsics::_invokeBasic) {
1640 has_receiver = true;
1641 } else if (iid == vmIntrinsics::_linkToNative) {
1642 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
1643 member_reg = rbx; // known to be free at this point
1644 } else {
1645 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1646 }
1647
1648 if (member_reg != noreg) {
1649 // Load the member_arg into register, if necessary.
1650 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1651 VMReg r = regs[member_arg_pos].first();
1652 if (r->is_stack()) {
1653 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1654 } else {
1655 // no data motion is needed
1656 member_reg = r->as_Register();
1657 }
1658 }
1659
1660 if (has_receiver) {
1661 // Make sure the receiver is loaded into a register.
1662 assert(method->size_of_parameters() > 0, "oob");
1663 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1664 VMReg r = regs[0].first();
1665 assert(r->is_valid(), "bad receiver arg");
1666 if (r->is_stack()) {
1667 // Porting note: This assumes that compiled calling conventions always
1668 // pass the receiver oop in a register. If this is not true on some
1669 // platform, pick a temp and load the receiver from stack.
1670 fatal("receiver always in a register");
1671 receiver_reg = j_rarg0; // known to be free at this point
1672 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1673 } else {
1674 // no data motion is needed
1675 receiver_reg = r->as_Register();
1676 }
1677 }
1678
1679 // Figure out which address we are really jumping to:
1680 MethodHandles::generate_method_handle_dispatch(masm, iid,
1681 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1682 }
1683
1684 // ---------------------------------------------------------------------------
1685 // Generate a native wrapper for a given method. The method takes arguments
1686 // in the Java compiled code convention, marshals them to the native
1687 // convention (handlizes oops, etc), transitions to native, makes the call,
1688 // returns to java state (possibly blocking), unhandlizes any result and
1689 // returns.
1690 //
1691 // Critical native functions are a shorthand for the use of
1692 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1693 // functions. The wrapper is expected to unpack the arguments before
1694 // passing them to the callee. Critical native functions leave the state _in_Java,
1695 // since they cannot stop for GC.
1696 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1697 // block and the check for pending exceptions, since it's impossible for them
1698 // to be thrown.
1699 //
1700 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1701 const methodHandle& method,
1702 int compile_id,
1703 BasicType* in_sig_bt,
1704 VMRegPair* in_regs,
1705 BasicType ret_type) {
1706 if (method->is_continuation_native_intrinsic()) {
1707 int exception_offset = -1;
1708 OopMapSet* oop_maps = new OopMapSet();
1709 int frame_complete = -1;
1710 int stack_slots = -1;
1711 int interpreted_entry_offset = -1;
1712 int vep_offset = -1;
1713 if (method->is_continuation_enter_intrinsic()) {
1714 gen_continuation_enter(masm,
1715 in_regs,
1716 exception_offset,
1717 oop_maps,
1718 frame_complete,
1719 stack_slots,
1720 interpreted_entry_offset,
1721 vep_offset);
1722 } else if (method->is_continuation_yield_intrinsic()) {
1723 gen_continuation_yield(masm,
1724 in_regs,
1725 oop_maps,
1726 frame_complete,
1727 stack_slots,
1728 vep_offset);
1729 } else {
1730 guarantee(false, "Unknown Continuation native intrinsic");
1731 }
1732
1733 #ifdef ASSERT
1734 if (method->is_continuation_enter_intrinsic()) {
1735 assert(interpreted_entry_offset != -1, "Must be set");
1736 assert(exception_offset != -1, "Must be set");
1737 } else {
1738 assert(interpreted_entry_offset == -1, "Must be unset");
1739 assert(exception_offset == -1, "Must be unset");
1740 }
1741 assert(frame_complete != -1, "Must be set");
1742 assert(stack_slots != -1, "Must be set");
1743 assert(vep_offset != -1, "Must be set");
1744 #endif
1745
1746 __ flush();
1747 nmethod* nm = nmethod::new_native_nmethod(method,
1748 compile_id,
1749 masm->code(),
1750 vep_offset,
1751 frame_complete,
1752 stack_slots,
1753 in_ByteSize(-1),
1754 in_ByteSize(-1),
1755 oop_maps,
1756 exception_offset);
1757 if (nm == nullptr) return nm;
1758 if (method->is_continuation_enter_intrinsic()) {
1759 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1760 } else if (method->is_continuation_yield_intrinsic()) {
1761 _cont_doYield_stub = nm;
1762 }
1763 return nm;
1764 }
1765
1766 if (method->is_method_handle_intrinsic()) {
1767 vmIntrinsics::ID iid = method->intrinsic_id();
1768 intptr_t start = (intptr_t)__ pc();
1769 int vep_offset = ((intptr_t)__ pc()) - start;
1770 gen_special_dispatch(masm,
1771 method,
1772 in_sig_bt,
1773 in_regs);
1774 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1775 __ flush();
1776 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1777 return nmethod::new_native_nmethod(method,
1778 compile_id,
1779 masm->code(),
1780 vep_offset,
1781 frame_complete,
1782 stack_slots / VMRegImpl::slots_per_word,
1783 in_ByteSize(-1),
1784 in_ByteSize(-1),
1785 nullptr);
1786 }
1787 address native_func = method->native_function();
1788 assert(native_func != nullptr, "must have function");
1789
1790 // An OopMap for lock (and class if static)
1791 OopMapSet *oop_maps = new OopMapSet();
1792 intptr_t start = (intptr_t)__ pc();
1793
1794   // We have received a description of where all the Java args are located
1795 // on entry to the wrapper. We need to convert these args to where
1796 // the jni function will expect them. To figure out where they go
1797 // we convert the java signature to a C signature by inserting
1798 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1799
1800 const int total_in_args = method->size_of_parameters();
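  // The extra C args are the JNIEnv* and, for a static method, the class mirror.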
1801 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1802
1803 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1804 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1805
1806 int argc = 0;
1807 out_sig_bt[argc++] = T_ADDRESS;
1808 if (method->is_static()) {
1809 out_sig_bt[argc++] = T_OBJECT;
1810 }
1811
1812 for (int i = 0; i < total_in_args ; i++ ) {
1813 out_sig_bt[argc++] = in_sig_bt[i];
1814 }
1815
1816 // Now figure out where the args must be stored and how much stack space
1817 // they require.
1818 int out_arg_slots;
1819 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1820
1821 // Compute framesize for the wrapper. We need to handlize all oops in
1822 // incoming registers
1823
1824 // Calculate the total number of stack slots we will need.
1825
1826 // First count the abi requirement plus all of the outgoing args
1827 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1828
1829 // Now the space for the inbound oop handle area
1830 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1831
1832 int oop_handle_offset = stack_slots;
1833 stack_slots += total_save_slots;
1834
1835 // Now any space we need for handlizing a klass if static method
1836
1837 int klass_slot_offset = 0;
1838 int klass_offset = -1;
1839 int lock_slot_offset = 0;
1840 bool is_static = false;
1841
1842 if (method->is_static()) {
1843 klass_slot_offset = stack_slots;
1844 stack_slots += VMRegImpl::slots_per_word;
1845 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1846 is_static = true;
1847 }
1848
1849 // Plus a lock if needed
1850
1851 if (method->is_synchronized()) {
1852 lock_slot_offset = stack_slots;
1853 stack_slots += VMRegImpl::slots_per_word;
1854 }
1855
1856 // Now a place (+2) to save return values or temp during shuffling
1857 // + 4 for return address (which we own) and saved rbp
1858 stack_slots += 6;
1859
1860   // OK, the space we have allocated will look like:
1861 //
1862 //
1863 // FP-> | |
1864 // |---------------------|
1865 // | 2 slots for moves |
1866 // |---------------------|
1867 // | lock box (if sync) |
1868 // |---------------------| <- lock_slot_offset
1869 // | klass (if static) |
1870 // |---------------------| <- klass_slot_offset
1871 // | oopHandle area |
1872 // |---------------------| <- oop_handle_offset (6 java arg registers)
1873 // | outbound memory |
1874 // | based arguments |
1875 // | |
1876 // |---------------------|
1877 // | |
1878 // SP-> | out_preserved_slots |
1879 //
1880 //
1881
1882
1883   // Now compute the actual number of stack words we need, rounding to keep the
1884   // stack properly aligned.
1885 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1886
1887 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1888
1889 // First thing make an ic check to see if we should even be here
1890
1891 // We are free to use all registers as temps without saving them and
1892 // restoring them except rbp. rbp is the only callee save register
1893 // as far as the interpreter and the compiler(s) are concerned.
1894
1895 const Register receiver = j_rarg0;
1896
1897 Label exception_pending;
1898
1899 assert_different_registers(receiver, rscratch1, rscratch2);
1900 __ verify_oop(receiver);
1901 __ ic_check(8 /* end_alignment */);
1902
1903 int vep_offset = ((intptr_t)__ pc()) - start;
1904
1905 if (method->needs_clinit_barrier()) {
1906 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1907 Label L_skip_barrier;
1908 Register klass = r10;
1909 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1910 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1911
1912 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1913
1914 __ bind(L_skip_barrier);
1915 }
1916
1917 #ifdef COMPILER1
1918 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1919 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1920 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1921 }
1922 #endif // COMPILER1
1923
1924 // The instruction at the verified entry point must be 5 bytes or longer
1925 // because it can be patched on the fly by make_non_entrant. The stack bang
1926 // instruction fits that requirement.
1927
1928 // Generate stack overflow check
1929 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1930
1931 // Generate a new frame for the wrapper.
1932 __ enter();
1933 // -2 because return address is already present and so is saved rbp
1934 __ subptr(rsp, stack_size - 2*wordSize);
1935
1936 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1937 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1938 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1939
1940 // Frame is now completed as far as size and linkage.
1941 int frame_complete = ((intptr_t)__ pc()) - start;
1942
1943 #ifdef ASSERT
1944 __ check_stack_alignment(rsp, "improperly aligned stack");
1945 #endif /* ASSERT */
1946
1947
1948 // We use r14 as the oop handle for the receiver/klass
1949 // It is callee save so it survives the call to native
1950
1951 const Register oop_handle_reg = r14;
1952
1953 //
1954   // We immediately shuffle the arguments so that for any vm call we have to
1955   // make from here on out (sync slow path, jvmti, etc.) we will have
1956 // captured the oops from our caller and have a valid oopMap for
1957 // them.
1958
1959 // -----------------
1960 // The Grand Shuffle
1961
1962 // The Java calling convention is either equal (linux) or denser (win64) than the
1963   // C calling convention. However, because of the jni_env argument the C calling
1964   // convention always has at least one more argument (and two more for static methods) than Java.
1965 // Therefore if we move the args from java -> c backwards then we will never have
1966 // a register->register conflict and we don't have to build a dependency graph
1967 // and figure out how to break any cycles.
1968 //
1969
1970 // Record esp-based slot for receiver on stack for non-static methods
1971 int receiver_offset = -1;
1972
1973 // This is a trick. We double the stack slots so we can claim
1974 // the oops in the caller's frame. Since we are sure to have
1975   // more args than the caller, doubling is enough to make
1976 // sure we can capture all the incoming oop args from the
1977 // caller.
1978 //
1979 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1980
1981 // Mark location of rbp (someday)
1982 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1983
1984 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1985 // All inbound args are referenced based on rbp and all outbound args via rsp.
1986
1987
1988 #ifdef ASSERT
1989 bool reg_destroyed[Register::number_of_registers];
1990 bool freg_destroyed[XMMRegister::number_of_registers];
1991 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1992 reg_destroyed[r] = false;
1993 }
1994 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1995 freg_destroyed[f] = false;
1996 }
1997
1998 #endif /* ASSERT */
1999
2000 // For JNI natives the incoming and outgoing registers are offset upwards.
2001 GrowableArray<int> arg_order(2 * total_in_args);
2002
2003 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2004 arg_order.push(i);
2005 arg_order.push(c_arg);
2006 }
2007
2008 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2009 int i = arg_order.at(ai);
2010 int c_arg = arg_order.at(ai + 1);
2011 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2012 #ifdef ASSERT
2013 if (in_regs[i].first()->is_Register()) {
2014 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2015 } else if (in_regs[i].first()->is_XMMRegister()) {
2016 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2017 }
2018 if (out_regs[c_arg].first()->is_Register()) {
2019 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2020 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2021 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2022 }
2023 #endif /* ASSERT */
2024 switch (in_sig_bt[i]) {
2025 case T_ARRAY:
2026 case T_OBJECT:
2027 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2028 ((i == 0) && (!is_static)),
2029 &receiver_offset);
2030 break;
2031 case T_VOID:
2032 break;
2033
2034 case T_FLOAT:
2035 __ float_move(in_regs[i], out_regs[c_arg]);
2036 break;
2037
2038 case T_DOUBLE:
2039 assert( i + 1 < total_in_args &&
2040 in_sig_bt[i + 1] == T_VOID &&
2041 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2042 __ double_move(in_regs[i], out_regs[c_arg]);
2043 break;
2044
2045 case T_LONG :
2046 __ long_move(in_regs[i], out_regs[c_arg]);
2047 break;
2048
2049 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2050
2051 default:
2052 __ move32_64(in_regs[i], out_regs[c_arg]);
2053 }
2054 }
2055
2056 int c_arg;
2057
2058 // Pre-load a static method's oop into r14. Used both by locking code and
2059 // the normal JNI call code.
2060 // point c_arg at the first arg that is already loaded in case we
2061 // need to spill before we call out
2062 c_arg = total_c_args - total_in_args;
2063
2064 if (method->is_static()) {
2065
2066 // load oop into a register
2067 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2068
2069     // Now handlize the static class mirror; it's known to be non-null.
2070 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2071 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2072
2073 // Now get the handle
2074 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2075 // store the klass handle as second argument
2076 __ movptr(c_rarg1, oop_handle_reg);
2077 // and protect the arg if we must spill
2078 c_arg--;
2079 }
2080
2081 // Change state to native (we save the return address in the thread, since it might not
2082 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2083 // points into the right code segment. It does not have to be the correct return pc.
2084 // We use the same pc/oopMap repeatedly when we call out
2085
2086 Label native_return;
2087 if (method->is_object_wait0()) {
2088 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2089 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2090 } else {
2091 intptr_t the_pc = (intptr_t) __ pc();
2092 oop_maps->add_gc_map(the_pc - start, map);
2093
2094 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2095 }
2096
2097   // We have all of the arguments set up at this point. We must not touch any of the
2098   // argument registers at this point (what if we had to save/restore them? there is no oopMap for them).
2099
2100 if (DTraceMethodProbes) {
2101 // protect the args we've loaded
2102 save_args(masm, total_c_args, c_arg, out_regs);
2103 __ mov_metadata(c_rarg1, method());
2104 __ call_VM_leaf(
2105 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2106 r15_thread, c_rarg1);
2107 restore_args(masm, total_c_args, c_arg, out_regs);
2108 }
2109
2110 // RedefineClasses() tracing support for obsolete method entry
2111 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2112 // protect the args we've loaded
2113 save_args(masm, total_c_args, c_arg, out_regs);
2114 __ mov_metadata(c_rarg1, method());
2115 __ call_VM_leaf(
2116 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2117 r15_thread, c_rarg1);
2118 restore_args(masm, total_c_args, c_arg, out_regs);
2119 }
2120
2121 // Lock a synchronized method
2122
2123 // Register definitions used by locking and unlocking
2124
2125 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2126 const Register obj_reg = rbx; // Will contain the oop
2127 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2128
2129 Label slow_path_lock;
2130 Label lock_done;
2131
2132 if (method->is_synchronized()) {
2133 // Get the handle (the 2nd argument)
2134 __ mov(oop_handle_reg, c_rarg1);
2135
2136 // Get address of the box
2137
2138 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2139
2140 // Load the oop from the handle
2141 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2142
2143 __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2144
2145 // Slow path will re-enter here
2146 __ bind(lock_done);
2147 }
2148
2149 // Finally just about ready to make the JNI call
2150
2151 // get JNIEnv* which is first argument to native
2152 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2153
2154 // Now set thread in native
2155 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2156
2157 __ call(RuntimeAddress(native_func));
2158
2159 // Verify or restore cpu control state after JNI call
2160 __ restore_cpu_control_state_after_jni(rscratch1);
2161
2162 // Unpack native results.
2163 switch (ret_type) {
2164 case T_BOOLEAN: __ c2bool(rax); break;
2165 case T_CHAR : __ movzwl(rax, rax); break;
2166 case T_BYTE : __ sign_extend_byte (rax); break;
2167 case T_SHORT : __ sign_extend_short(rax); break;
2168 case T_INT : /* nothing to do */ break;
2169 case T_DOUBLE :
2170 case T_FLOAT :
2171 // Result is in xmm0 we'll save as needed
2172 break;
2173 case T_ARRAY: // Really a handle
2174 case T_OBJECT: // Really a handle
2175 break; // can't de-handlize until after safepoint check
2176 case T_VOID: break;
2177 case T_LONG: break;
2178 default : ShouldNotReachHere();
2179 }
2180
2181 // Switch thread to "native transition" state before reading the synchronization state.
2182 // This additional state is necessary because reading and testing the synchronization
2183 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2184 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2185 // VM thread changes sync state to synchronizing and suspends threads for GC.
2186 // Thread A is resumed to finish this native method, but doesn't block here since it
2187   //     didn't see any synchronization in progress, and escapes.
2188 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2189
2190 // Force this write out before the read below
2191 if (!UseSystemMemoryBarrier) {
2192 __ membar(Assembler::Membar_mask_bits(
2193 Assembler::LoadLoad | Assembler::LoadStore |
2194 Assembler::StoreLoad | Assembler::StoreStore));
2195 }
2196
2197 // check for safepoint operation in progress and/or pending suspend requests
2198 {
2199 Label Continue;
2200 Label slow_path;
2201
2202 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2203
2204 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2205 __ jcc(Assembler::equal, Continue);
2206 __ bind(slow_path);
2207
2208 // Don't use call_VM as it will see a possible pending exception and forward it
2209     // and never return here, preventing us from clearing _last_native_pc down below.
2210     // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2211 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2212 // by hand.
2213 //
2214 __ vzeroupper();
2215 save_native_result(masm, ret_type, stack_slots);
2216 __ mov(c_rarg0, r15_thread);
2217 __ mov(r12, rsp); // remember sp
2218 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2219 __ andptr(rsp, -16); // align stack as required by ABI
2220 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2221 __ mov(rsp, r12); // restore sp
2222 __ reinit_heapbase();
2223 // Restore any method result value
2224 restore_native_result(masm, ret_type, stack_slots);
2225 __ bind(Continue);
2226 }
2227
2228 // change thread state
2229 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2230
2231 if (method->is_object_wait0()) {
2232 // Check preemption for Object.wait()
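    // If the wait was preempted, the alternate return address is non-null; clear it and resume there.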
2233 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2234 __ cmpptr(rscratch1, NULL_WORD);
2235 __ jccb(Assembler::equal, native_return);
2236 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2237 __ jmp(rscratch1);
2238 __ bind(native_return);
2239
2240 intptr_t the_pc = (intptr_t) __ pc();
2241 oop_maps->add_gc_map(the_pc - start, map);
2242 }
2243
2244
2245 Label reguard;
2246 Label reguard_done;
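  // If the yellow guard pages were disabled while we were in native code, reguard them on the slow path below.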
2247 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2248 __ jcc(Assembler::equal, reguard);
2249 __ bind(reguard_done);
2250
2251 // native result if any is live
2252
2253 // Unlock
2254 Label slow_path_unlock;
2255 Label unlock_done;
2256 if (method->is_synchronized()) {
2257
2258 Label fast_done;
2259
2260 // Get locked oop from the handle we passed to jni
2261 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2262
2263 // Must save rax if it is live now because cmpxchg must use it
2264 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2265 save_native_result(masm, ret_type, stack_slots);
2266 }
2267
2268 __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2269
2270 // slow path re-enters here
2271 __ bind(unlock_done);
2272 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2273 restore_native_result(masm, ret_type, stack_slots);
2274 }
2275
2276 __ bind(fast_done);
2277 }
2278 if (DTraceMethodProbes) {
2279 save_native_result(masm, ret_type, stack_slots);
2280 __ mov_metadata(c_rarg1, method());
2281 __ call_VM_leaf(
2282 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2283 r15_thread, c_rarg1);
2284 restore_native_result(masm, ret_type, stack_slots);
2285 }
2286
2287 __ reset_last_Java_frame(false);
2288
2289 // Unbox oop result, e.g. JNIHandles::resolve value.
2290 if (is_reference_type(ret_type)) {
2291 __ resolve_jobject(rax /* value */,
2292 rcx /* tmp */);
2293 }
2294
2295 if (CheckJNICalls) {
2296 // clear_pending_jni_exception_check
2297 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2298 }
2299
2300 // reset handle block
2301 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2302 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2303
2304 // pop our frame
2305
2306 __ leave();
2307
2308 #if INCLUDE_JFR
2309 // We need to do a poll test after unwind in case the sampler
2310 // managed to sample the native frame after returning to Java.
2311 Label L_return;
2312 address poll_test_pc = __ pc();
2313 __ relocate(relocInfo::poll_return_type);
2314 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2315 __ jccb(Assembler::zero, L_return);
2316 __ lea(rscratch1, InternalAddress(poll_test_pc));
2317 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2318 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2319 "polling page return stub not created yet");
2320 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2321 __ jump(RuntimeAddress(stub));
2322 __ bind(L_return);
2323 #endif // INCLUDE_JFR
2324
2325 // Any exception pending?
2326 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2327 __ jcc(Assembler::notEqual, exception_pending);
2328
2329 // Return
2330
2331 __ ret(0);
2332
2333 // Unexpected paths are out of line and go here
2334
2335   // An exception is pending
2336 __ bind(exception_pending);
2337
2338 // and forward the exception
2339 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2340
2341 // Slow path locking & unlocking
2342 if (method->is_synchronized()) {
2343
2344 // BEGIN Slow path lock
2345 __ bind(slow_path_lock);
2346
2347 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2348 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2349
2350 // protect the args we've loaded
2351 save_args(masm, total_c_args, c_arg, out_regs);
2352
2353 __ mov(c_rarg0, obj_reg);
2354 __ mov(c_rarg1, lock_reg);
2355 __ mov(c_rarg2, r15_thread);
2356
2357 // Not a leaf but we have last_Java_frame setup as we want.
2358 // We don't want to unmount in case of contention since that would complicate preserving
2359 // the arguments that had already been marshalled into the native convention. So we force
2360 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2361 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2362 __ push_cont_fastpath();
2363 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2364 __ pop_cont_fastpath();
2365 restore_args(masm, total_c_args, c_arg, out_regs);
2366
2367 #ifdef ASSERT
2368 { Label L;
2369 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2370 __ jcc(Assembler::equal, L);
2371 __ stop("no pending exception allowed on exit from monitorenter");
2372 __ bind(L);
2373 }
2374 #endif
2375 __ jmp(lock_done);
2376
2377 // END Slow path lock
2378
2379 // BEGIN Slow path unlock
2380 __ bind(slow_path_unlock);
2381
2382 // If we haven't already saved the native result we must save it now as xmm registers
2383 // are still exposed.
2384 __ vzeroupper();
2385 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2386 save_native_result(masm, ret_type, stack_slots);
2387 }
2388
2389 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2390
2391 __ mov(c_rarg0, obj_reg);
2392 __ mov(c_rarg2, r15_thread);
2393 __ mov(r12, rsp); // remember sp
2394 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2395 __ andptr(rsp, -16); // align stack as required by ABI
2396
2397 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2398 // NOTE that obj_reg == rbx currently
2399 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2400 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2401
2402 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2403 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2404 __ mov(rsp, r12); // restore sp
2405 __ reinit_heapbase();
2406 #ifdef ASSERT
2407 {
2408 Label L;
2409 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2410 __ jcc(Assembler::equal, L);
2411 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2412 __ bind(L);
2413 }
2414 #endif /* ASSERT */
2415
2416 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2417
2418 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2419 restore_native_result(masm, ret_type, stack_slots);
2420 }
2421 __ jmp(unlock_done);
2422
2423 // END Slow path unlock
2424
2425 } // synchronized
2426
2427 // SLOW PATH Reguard the stack if needed
2428
2429 __ bind(reguard);
2430 __ vzeroupper();
2431 save_native_result(masm, ret_type, stack_slots);
2432 __ mov(r12, rsp); // remember sp
2433 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2434 __ andptr(rsp, -16); // align stack as required by ABI
2435 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2436 __ mov(rsp, r12); // restore sp
2437 __ reinit_heapbase();
2438 restore_native_result(masm, ret_type, stack_slots);
2439 // and continue
2440 __ jmp(reguard_done);
2441
2442
2443
2444 __ flush();
2445
2446 nmethod *nm = nmethod::new_native_nmethod(method,
2447 compile_id,
2448 masm->code(),
2449 vep_offset,
2450 frame_complete,
2451 stack_slots / VMRegImpl::slots_per_word,
2452 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2453 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2454 oop_maps);
2455
2456 return nm;
2457 }
2458
2459 // This function returns the adjustment size (in number of words) to a c2i adapter
2460 // activation for use during deoptimization.
2461 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2462 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2463 }
2464
2465
2466 uint SharedRuntime::out_preserve_stack_slots() {
2467 return 0;
2468 }
2469
2470
2471 // Number of stack slots between incoming argument block and the start of
2472 // a new frame. The PROLOG must add this many slots to the stack. The
2473 // EPILOG must remove this many slots. amd64 needs two slots for the
2474 // return address and two for the saved rbp.
2475 uint SharedRuntime::in_preserve_stack_slots() {
2476 return 4 + 2 * VerifyStackAtCalls;
2477 }
2478
2479 VMReg SharedRuntime::thread_register() {
2480 return r15_thread->as_VMReg();
2481 }
2482
2483 //------------------------------generate_deopt_blob----------------------------
2484 void SharedRuntime::generate_deopt_blob() {
2485 // Allocate space for the code
2486 ResourceMark rm;
2487 // Setup code generation tools
2488 int pad = 0;
2489 if (UseAVX > 2) {
2490 pad += 1024;
2491 }
2492 if (UseAPX) {
2493 pad += 1024;
2494 }
2495 #if INCLUDE_JVMCI
2496 if (EnableJVMCI) {
2497 pad += 512; // Increase the buffer size when compiling for JVMCI
2498 }
2499 #endif
2500 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2501 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2502 if (blob != nullptr) {
2503 _deopt_blob = blob->as_deoptimization_blob();
2504 return;
2505 }
2506
2507 CodeBuffer buffer(name, 2560+pad, 1024);
2508 MacroAssembler* masm = new MacroAssembler(&buffer);
2509 int frame_size_in_words;
2510 OopMap* map = nullptr;
2511 OopMapSet *oop_maps = new OopMapSet();
2512
2513 // -------------
2514 // This code enters when returning to a de-optimized nmethod. A return
2515 // address has been pushed on the stack, and return values are in
2516 // registers.
2517   // If we are doing a normal deopt then we were called from the patched
2518   // nmethod at the point we returned to the nmethod. So the return
2519   // address on the stack is wrong by NativeCall::instruction_size.
2520 // We will adjust the value so it looks like we have the original return
2521 // address on the stack (like when we eagerly deoptimized).
2522 // In the case of an exception pending when deoptimizing, we enter
2523 // with a return address on the stack that points after the call we patched
2524 // into the exception handler. We have the following register state from,
2525 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2526 // rax: exception oop
2527 // rbx: exception handler
2528 // rdx: throwing pc
2529 // So in this case we simply jam rdx into the useless return address and
2530 // the stack looks just like we want.
2531 //
2532 // At this point we need to de-opt. We save the argument return
2533 // registers. We call the first C routine, fetch_unroll_info(). This
2534 // routine captures the return values and returns a structure which
2535 // describes the current frame size and the sizes of all replacement frames.
2536 // The current frame is compiled code and may contain many inlined
2537 // functions, each with their own JVM state. We pop the current frame, then
2538 // push all the new frames. Then we call the C routine unpack_frames() to
2539 // populate these frames. Finally unpack_frames() returns us the new target
2540 // address. Notice that callee-save registers are BLOWN here; they have
2541 // already been captured in the vframeArray at the time the return PC was
2542 // patched.
2543 address start = __ pc();
2544 Label cont;
2545
2546 // Prolog for non exception case!
2547
2548 // Save everything in sight.
2549 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2550
2551 // Normal deoptimization. Save exec mode for unpack_frames.
2552 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2553 __ jmp(cont);
2554
2555 int reexecute_offset = __ pc() - start;
2556 #if INCLUDE_JVMCI && !defined(COMPILER1)
2557 if (UseJVMCICompiler) {
2558 // JVMCI does not use this kind of deoptimization
2559 __ should_not_reach_here();
2560 }
2561 #endif
2562
2563 // Reexecute case
2564   // The return address is the pc that describes what bci to re-execute at
2565
2566 // No need to update map as each call to save_live_registers will produce identical oopmap
2567 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2568
2569 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2570 __ jmp(cont);
2571
2572 #if INCLUDE_JVMCI
2573 Label after_fetch_unroll_info_call;
2574 int implicit_exception_uncommon_trap_offset = 0;
2575 int uncommon_trap_offset = 0;
2576
2577 if (EnableJVMCI) {
2578 implicit_exception_uncommon_trap_offset = __ pc() - start;
2579
2580 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2581 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2582
2583 uncommon_trap_offset = __ pc() - start;
2584
2585 // Save everything in sight.
2586 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2587 // fetch_unroll_info needs to call last_java_frame()
2588 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2589
2590 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2591 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2592
2593 __ movl(r14, Deoptimization::Unpack_reexecute);
2594 __ mov(c_rarg0, r15_thread);
2595 __ movl(c_rarg2, r14); // exec mode
2596 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2597 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2598
2599 __ reset_last_Java_frame(false);
2600
2601 __ jmp(after_fetch_unroll_info_call);
2602 } // EnableJVMCI
2603 #endif // INCLUDE_JVMCI
2604
2605 int exception_offset = __ pc() - start;
2606
2607 // Prolog for exception case
2608
2609   // all registers are dead at this entry point, except for rax and
2610   // rdx, which contain the exception oop and exception pc
2611 // respectively. Set them in TLS and fall thru to the
2612 // unpack_with_exception_in_tls entry point.
2613
2614 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2615 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2616
2617 int exception_in_tls_offset = __ pc() - start;
2618
2619 // new implementation because exception oop is now passed in JavaThread
2620
2621 // Prolog for exception case
2622 // All registers must be preserved because they might be used by LinearScan
2623   // Exception oop and throwing PC are passed in JavaThread
2624 // tos: stack at point of call to method that threw the exception (i.e. only
2625 // args are on the stack, no return address)
2626
2627 // make room on stack for the return address
2628 // It will be patched later with the throwing pc. The correct value is not
2629 // available now because loading it from memory would destroy registers.
2630 __ push(0);
2631
2632 // Save everything in sight.
2633 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2634
2635 // Now it is safe to overwrite any register
2636
2637 // Deopt during an exception. Save exec mode for unpack_frames.
2638 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2639
2640 // load throwing pc from JavaThread and patch it as the return address
2641 // of the current frame. Then clear the field in JavaThread
2642
2643 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2644 __ movptr(Address(rbp, wordSize), rdx);
2645 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2646
2647 #ifdef ASSERT
2648 // verify that there is really an exception oop in JavaThread
2649 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2650 __ verify_oop(rax);
2651
2652 // verify that there is no pending exception
2653 Label no_pending_exception;
2654 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2655 __ testptr(rax, rax);
2656 __ jcc(Assembler::zero, no_pending_exception);
2657 __ stop("must not have pending exception here");
2658 __ bind(no_pending_exception);
2659 #endif
2660
2661 __ bind(cont);
2662
2663 // Call C code. Need thread and this frame, but NOT official VM entry
2664 // crud. We cannot block on this call, no GC can happen.
2665 //
2666 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2667
2668 // fetch_unroll_info needs to call last_java_frame().
2669
2670 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2671 #ifdef ASSERT
2672 { Label L;
2673 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2674 __ jcc(Assembler::equal, L);
2675 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2676 __ bind(L);
2677 }
2678 #endif // ASSERT
2679 __ mov(c_rarg0, r15_thread);
2680 __ movl(c_rarg1, r14); // exec_mode
2681 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2682
2683 // Need to have an oopmap that tells fetch_unroll_info where to
2684 // find any register it might need.
2685 oop_maps->add_gc_map(__ pc() - start, map);
2686
2687 __ reset_last_Java_frame(false);
2688
2689 #if INCLUDE_JVMCI
2690 if (EnableJVMCI) {
2691 __ bind(after_fetch_unroll_info_call);
2692 }
2693 #endif
2694
2695 // Load UnrollBlock* into rdi
2696 __ mov(rdi, rax);
2697
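  // Load the unpack kind (exec mode) into callee-saved r14; it is tested against
  // Unpack_exception here and passed to unpack_frames() below.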
2698 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2699 Label noException;
2700 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2701 __ jcc(Assembler::notEqual, noException);
2702 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2703 // QQQ this is useless it was null above
2704 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2705 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2706 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2707
2708 __ verify_oop(rax);
2709
2710 // Overwrite the result registers with the exception results.
2711 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2712 // I think this is useless
2713 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2714
2715 __ bind(noException);
2716
2717 // Only register save data is on the stack.
2718 // Now restore the result registers. Everything else is either dead
2719 // or captured in the vframeArray.
2720 RegisterSaver::restore_result_registers(masm);
2721
2722   // All of the register save area has been popped off the stack. Only the
2723 // return address remains.
2724
2725 // Pop all the frames we must move/replace.
2726 //
2727 // Frame picture (youngest to oldest)
2728 // 1: self-frame (no frame link)
2729 // 2: deopting frame (no frame link)
2730 // 3: caller of deopting frame (could be compiled/interpreted).
2731 //
2732 // Note: by leaving the return address of self-frame on the stack
2733 // and using the size of frame 2 to adjust the stack
2734 // when we are done the return to frame 3 will still be on the stack.
2735
2736 // Pop deoptimized frame
2737 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2738 __ addptr(rsp, rcx);
2739
2740 // rsp should be pointing at the return address to the caller (3)
2741
2742 // Pick up the initial fp we should save
2743 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2744 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2745
2746 #ifdef ASSERT
2747   // Compilers generate code that bangs the stack by as much as the
2748   // interpreter would need. So this stack banging should never
2749   // trigger a fault. Verify that it does not on non-product builds.
2750 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2751 __ bang_stack_size(rbx, rcx);
2752 #endif
2753
2754 // Load address of array of frame pcs into rcx
2755 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2756
2757 // Trash the old pc
2758 __ addptr(rsp, wordSize);
2759
2760 // Load address of array of frame sizes into rsi
2761 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2762
2763 // Load counter into rdx
2764 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2765
2766   // Now adjust the caller's stack to make up for the extra locals,
2767   // but record the original sp so that we can save it in the skeletal interpreter
2768   // frame. That way the stack walking of interpreter_sender will get the unextended sp
2769 // value and not the "real" sp value.
2770
2771 const Register sender_sp = r8;
2772
2773 __ mov(sender_sp, rsp);
2774 __ movl(rbx, Address(rdi,
2775 Deoptimization::UnrollBlock::
2776 caller_adjustment_offset()));
2777 __ subptr(rsp, rbx);
2778
2779 // Push interpreter frames in a loop
2780 Label loop;
2781 __ bind(loop);
2782 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2783 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2784 __ pushptr(Address(rcx, 0)); // Save return address
2785 __ enter(); // Save old & set new ebp
2786 __ subptr(rsp, rbx); // Prolog
2787 // This value is corrected by layout_activation_impl
2788 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2789 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2790 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2791 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2792 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2793 __ decrementl(rdx); // Decrement counter
2794 __ jcc(Assembler::notZero, loop);
2795 __ pushptr(Address(rcx, 0)); // Save final return address
2796
2797 // Re-push self-frame
2798 __ enter(); // Save old & set new ebp
2799
2800 // Allocate a full sized register save area.
2801 // Return address and rbp are in place, so we allocate two less words.
2802 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2803
2804 // Restore frame locals after moving the frame
2805 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2806 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2807
2808 // Call C code. Need thread but NOT official VM entry
2809 // crud. We cannot block on this call, no GC can happen. Call should
2810 // restore return values to their stack-slots with the new SP.
2811 //
2812 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2813
2814 // Use rbp because the frames look interpreted now
2815 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2816 // Don't need the precise return PC here, just precise enough to point into this code blob.
2817 address the_pc = __ pc();
2818 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2819
2820 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2821 __ mov(c_rarg0, r15_thread);
2822 __ movl(c_rarg1, r14); // second arg: exec_mode
2823 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2824 // Revert SP alignment after call since we're going to do some SP relative addressing below
2825 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2826
2827 // Set an oopmap for the call site
2828 // Use the same PC we used for the last java frame
2829 oop_maps->add_gc_map(the_pc - start,
2830 new OopMap( frame_size_in_words, 0 ));
2831
2832 // Clear fp AND pc
2833 __ reset_last_Java_frame(true);
2834
2835 // Collect return values
2836 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2837 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2838 // I think this is useless (throwing pc?)
2839 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2840
2841 // Pop self-frame.
2842 __ leave(); // Epilog
2843
2844 // Jump to interpreter
2845 __ ret(0);
2846
2847 // Make sure all code is generated
2848 masm->flush();
2849
2850 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2851 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2852 #if INCLUDE_JVMCI
2853 if (EnableJVMCI) {
2854 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2855 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2856 }
2857 #endif
2858
2859 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2860 }
2861
2862 //------------------------------generate_handler_blob------
2863 //
2864 // Generate a special Compile2Runtime blob that saves all registers,
2865 // and sets up an oopmap.
2866 //
2867 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2868 assert(StubRoutines::forward_exception_entry() != nullptr,
2869 "must be generated before");
2870 assert(is_polling_page_id(id), "expected a polling page stub id");
2871
2872 // Allocate space for the code. Setup code generation tools.
2873 const char* name = SharedRuntime::stub_name(id);
2874 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2875 if (blob != nullptr) {
2876 return blob->as_safepoint_blob();
2877 }
2878
2879 ResourceMark rm;
2880 OopMapSet *oop_maps = new OopMapSet();
2881 OopMap* map;
2882 CodeBuffer buffer(name, 2548, 1024);
2883 MacroAssembler* masm = new MacroAssembler(&buffer);
2884
2885 address start = __ pc();
2886 address call_pc = nullptr;
2887 int frame_size_in_words;
2888 bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2889 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
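  // When cause_return is true the poll happened at a return, so the return address is already on the stack;
  // otherwise we make room for it and later fill in the pc saved by the signal handler.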
2890
2891 // Make room for return address (or push it again)
2892 if (!cause_return) {
2893 __ push(rbx);
2894 }
2895
2896 // Save registers, fpu state, and flags
2897 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2898
2899 // The following is basically a call_VM. However, we need the precise
2900 // address of the call in order to generate an oopmap. Hence, we do all the
2901 // work ourselves.
2902
2903 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2904
2905   // The return address must always be correct so that the frame constructor never
2906 // sees an invalid pc.
2907
2908 if (!cause_return) {
2909 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2910 // Additionally, rbx is a callee saved register and we can look at it later to determine
2911 // if someone changed the return address for us!
2912 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2913 __ movptr(Address(rbp, wordSize), rbx);
2914 }
2915
2916 // Do the call
2917 __ mov(c_rarg0, r15_thread);
2918 __ call(RuntimeAddress(call_ptr));
2919
2920 // Set an oopmap for the call site. This oopmap will map all
2921 // oop-registers and debug-info registers as callee-saved. This
2922 // will allow deoptimization at this safepoint to find all possible
2923 // debug-info recordings, as well as let GC find all oops.
2924
2925 oop_maps->add_gc_map( __ pc() - start, map);
2926
2927 Label noException;
2928
2929 __ reset_last_Java_frame(false);
2930
2931 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2932 __ jcc(Assembler::equal, noException);
2933
2934 // Exception pending
2935
2936 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2937
2938 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2939
2940 // No exception case
2941 __ bind(noException);
2942
2943 Label no_adjust;
2944 #ifdef ASSERT
2945 Label bail;
2946 #endif
2947 if (!cause_return) {
2948 Label no_prefix, not_special, check_rex_prefix;
2949
2950 // If our stashed return pc was modified by the runtime we avoid touching it
2951 __ cmpptr(rbx, Address(rbp, wordSize));
2952 __ jcc(Assembler::notEqual, no_adjust);
2953
2954 // Skip over the poll instruction.
2955 // See NativeInstruction::is_safepoint_poll()
2956 // Possible encodings:
2957 // 85 00 test %eax,(%rax)
2958 // 85 01 test %eax,(%rcx)
2959 // 85 02 test %eax,(%rdx)
2960 // 85 03 test %eax,(%rbx)
2961 // 85 06 test %eax,(%rsi)
2962 // 85 07 test %eax,(%rdi)
2963 //
2964 // 41 85 00 test %eax,(%r8)
2965 // 41 85 01 test %eax,(%r9)
2966 // 41 85 02 test %eax,(%r10)
2967 // 41 85 03 test %eax,(%r11)
2968 // 41 85 06 test %eax,(%r14)
2969 // 41 85 07 test %eax,(%r15)
2970 //
2971 // 85 04 24 test %eax,(%rsp)
2972 // 41 85 04 24 test %eax,(%r12)
2973 // 85 45 00 test %eax,0x0(%rbp)
2974 // 41 85 45 00 test %eax,0x0(%r13)
2975 //
2976 // Notes:
2977     // Format of the legacy MAP0 test instruction:
2978     //   [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
2979     //  o For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
2980     //    register operand and of the base register of the memory operand are both in [0-8),
2981     //    so no additional REX prefix (whose REX.B bit would hold the MSB of the register
2982     //    encoding) is required and a two-byte encoding is sufficient.
2983     //  o For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the
2984     //    BASE register of the memory operand is 1000b, so an additional REX prefix is needed,
2985     //    adding one byte to the instruction encoding.
2986     //  o If the BASE register is one of the 32 extended GPRs available only on targets that
2987     //    support the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the
2988     //    two most significant bits of the 5-bit register encoding.
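    //
    // For example, for "41 85 04 24  test %eax,(%r12)" the adjustment applied below is
    // 1 (REX.B prefix) + 1 (the extra SIB/disp8 byte needed for an rsp/r12/rbp/r13 base)
    // + 2 (opcode + ModRM) = 4 bytes, while the common "85 00  test %eax,(%rax)" needs
    // only the final 2-byte adjustment.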
2989
2990 if (VM_Version::supports_apx_f()) {
2991 __ cmpb(Address(rbx, 0), Assembler::REX2);
2992 __ jccb(Assembler::notEqual, check_rex_prefix);
2993 __ addptr(rbx, 2);
2994 __ bind(check_rex_prefix);
2995 }
2996 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2997 __ jccb(Assembler::notEqual, no_prefix);
2998 __ addptr(rbx, 1);
2999 __ bind(no_prefix);
3000 #ifdef ASSERT
3001 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3002 #endif
3003 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3004 // r12/rsp 0x04
3005 // r13/rbp 0x05
3006 __ movzbq(rcx, Address(rbx, 1));
3007 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3008 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3009 __ cmpptr(rcx, 1);
3010 __ jccb(Assembler::above, not_special);
3011 __ addptr(rbx, 1);
3012 __ bind(not_special);
3013 #ifdef ASSERT
3014 // Verify the correct encoding of the poll we're about to skip.
3015 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3016 __ jcc(Assembler::notEqual, bail);
3017 // Mask out the modrm bits
3018 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3019 // rax encodes to 0, so if the bits are nonzero it's incorrect
3020 __ jcc(Assembler::notZero, bail);
3021 #endif
3022 // Adjust return pc forward to step over the safepoint poll instruction
3023 __ addptr(rbx, 2);
3024 __ movptr(Address(rbp, wordSize), rbx);
3025 }
3026
3027 __ bind(no_adjust);
3028 // Normal exit, restore registers and exit.
3029 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3030 __ ret(0);
3031
3032 #ifdef ASSERT
3033 __ bind(bail);
3034 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3035 #endif
3036
3037 // Make sure all code is generated
3038 masm->flush();
3039
3040 // Fill-out other meta info
3041 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3042
3043 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3044 return sp_blob;
3045 }
3046
3047 //
3048 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3049 //
3050 // Generate a stub that calls into the VM to find out the proper destination
3051 // of a Java call. All the argument registers are live at this point,
3052 // but since this is generic code we don't know what they are and the caller
3053 // must do any GC of the args.
3054 //
3055 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3056   assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3057 assert(is_resolve_id(id), "expected a resolve stub id");
3058
3059 const char* name = SharedRuntime::stub_name(id);
3060 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3061 if (blob != nullptr) {
3062 return blob->as_runtime_stub();
3063 }
3064
3065 // allocate space for the code
3066 ResourceMark rm;
3067 CodeBuffer buffer(name, 1552, 512);
3068 MacroAssembler* masm = new MacroAssembler(&buffer);
3069
3070 int frame_size_in_words;
3071
3072 OopMapSet *oop_maps = new OopMapSet();
3073 OopMap* map = nullptr;
3074
3075 int start = __ offset();
3076
3077 // No need to save vector registers since they are caller-saved anyway.
3078 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3079
3080 int frame_complete = __ offset();
3081
3082 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3083
3084 __ mov(c_rarg0, r15_thread);
3085
3086 __ call(RuntimeAddress(destination));
3087
3088
3089 // Set an oopmap for the call site.
3090 // We need this not only for callee-saved registers, but also for volatile
3091 // registers that the compiler might be keeping live across a safepoint.
3092
3093 oop_maps->add_gc_map( __ offset() - start, map);
3094
3095   // rax contains the address we are going to jump to, assuming no exception was installed
3096
3097 // clear last_Java_sp
3098 __ reset_last_Java_frame(false);
3099 // check for pending exceptions
3100 Label pending;
3101 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3102 __ jcc(Assembler::notEqual, pending);
3103
3104 // get the returned Method*
3105 __ get_vm_result_metadata(rbx);
3106 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3107
3108 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3109
3110 RegisterSaver::restore_live_registers(masm);
3111
3112 // We are back to the original state on entry and ready to go.
3113
3114 __ jmp(rax);
3115
3116 // Pending exception after the safepoint
3117
3118 __ bind(pending);
3119
3120 RegisterSaver::restore_live_registers(masm);
3121
3122 // exception pending => remove activation and forward to exception handler
3123
3124 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3125
3126 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3127 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3128
3129 // -------------
3130 // make sure all code is generated
3131 masm->flush();
3132
3133 // return the blob
3134   // the frame size passed to new_runtime_stub is in words
3135 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3136
3137 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3138 return rs_blob;
3139 }
3140
3141 // Continuation point for throwing of implicit exceptions that are
3142 // not handled in the current activation. Fabricates an exception
3143 // oop and initiates normal exception dispatching in this
3144 // frame. Since we need to preserve callee-saved values (currently
3145 // only for C2, but done for C1 as well) we need a callee-saved oop
3146 // map and therefore have to make these stubs into RuntimeStubs
3147 // rather than BufferBlobs. If the compiler needs all registers to
3148 // be preserved between the fault point and the exception handler
3149 // then it must assume responsibility for that in
3150 // AbstractCompiler::continuation_for_implicit_null_exception or
3151 // continuation_for_implicit_division_by_zero_exception. All other
3152 // implicit exceptions (e.g., NullPointerException or
3153 // AbstractMethodError on entry) are either at call sites or
3154 // otherwise assume that stack unwinding will be initiated, so
3155 // caller-saved registers were assumed volatile in the compiler.
3156 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3157 assert(is_throw_id(id), "expected a throw stub id");
3158
3159 const char* name = SharedRuntime::stub_name(id);
3160
3161 // Information about frame layout at time of blocking runtime call.
3162 // Note that we only have to preserve callee-saved registers since
3163 // the compilers are responsible for supplying a continuation point
3164 // if they expect all registers to be preserved.
3165 enum layout {
3166 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3167 rbp_off2,
3168 return_off,
3169 return_off2,
3170 framesize // inclusive of return address
3171 };
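
  // The layout offsets above are in 32-bit slots: rbp_off starts after the
  // Win64 register-argument shadow area (frame::arg_reg_save_area_bytes is
  // zero on other platforms), and framesize is converted from slots to words
  // when the RuntimeStub is created below.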
3172
3173 int insts_size = 512;
3174 int locs_size = 64;
3175
3176 const char* timer_msg = "SharedRuntime generate_throw_exception";
3177 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3178
3179 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3180 if (blob != nullptr) {
3181 return blob->as_runtime_stub();
3182 }
3183
3184 ResourceMark rm;
3185 CodeBuffer code(name, insts_size, locs_size);
3186 OopMapSet* oop_maps = new OopMapSet();
3187 MacroAssembler* masm = new MacroAssembler(&code);
3188
3189 address start = __ pc();
3190
3191 // This is an inlined and slightly modified version of call_VM
3192 // which has the ability to fetch the return PC out of
3193 // thread-local storage and also sets up last_Java_sp slightly
3194 // differently than the real call_VM
3195
3196 __ enter(); // required for proper stackwalking of RuntimeStub frame
3197
3198 assert(is_even(framesize/2), "sp not 16-byte aligned");
3199
3200 // return address and rbp are already in place
3201 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3202
3203 int frame_complete = __ pc() - start;
3204
3205 // Set up last_Java_sp and last_Java_fp
3206 address the_pc = __ pc();
3207 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3208 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3209
3210 // Call runtime
3211 __ movptr(c_rarg0, r15_thread);
3212 BLOCK_COMMENT("call runtime_entry");
3213 __ call(RuntimeAddress(runtime_entry));
3214
3215 // Generate oop map
3216 OopMap* map = new OopMap(framesize, 0);
3217
3218 oop_maps->add_gc_map(the_pc - start, map);
3219
3220 __ reset_last_Java_frame(true);
3221
3222 __ leave(); // required for proper stackwalking of RuntimeStub frame
3223
3224 // check for pending exceptions
3225 #ifdef ASSERT
3226 Label L;
3227 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3228 __ jcc(Assembler::notEqual, L);
3229 __ should_not_reach_here();
3230 __ bind(L);
3231 #endif // ASSERT
3232 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3233
3234
3235 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3236 RuntimeStub* stub =
3237 RuntimeStub::new_runtime_stub(name,
3238 &code,
3239 frame_complete,
3240 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3241 oop_maps, false);
3242 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3243
3244 return stub;
3245 }
3246
3247 //------------------------------Montgomery multiplication------------------------
3248 //
3249
3250 #ifndef _WINDOWS
3251
3252 // Subtract 0:b from carry:a. Return carry.
3253 static julong
3254 sub(julong a[], julong b[], julong carry, long len) {
3255 long long i = 0, cnt = len;
3256 julong tmp;
3257 asm volatile("clc; "
3258 "0: ; "
3259 "mov (%[b], %[i], 8), %[tmp]; "
3260 "sbb %[tmp], (%[a], %[i], 8); "
3261 "inc %[i]; dec %[cnt]; "
3262 "jne 0b; "
3263 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3264 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3265 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3266 : "memory");
3267 return tmp;
3268 }
3269
3270 // Multiply (unsigned) Long A by Long B, accumulating the double-
3271 // length result into the accumulator formed of T0, T1, and T2.
3272 #define MACC(A, B, T0, T1, T2) \
3273 do { \
3274 unsigned long hi, lo; \
3275 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3276 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3277 : "r"(A), "a"(B) : "cc"); \
3278 } while(0)
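
// In effect each MACC performs the multiply-accumulate
//   (T2:T1:T0) += (julong)A * (julong)B
// using the rdx:rax double-length result of the one-operand mul and
// propagating the carries into T1 and T2.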
3279
3280 // As above, but add twice the double-length result into the
3281 // accumulator.
3282 #define MACC2(A, B, T0, T1, T2) \
3283 do { \
3284 unsigned long hi, lo; \
3285 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3286 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3287 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3288 : "r"(A), "a"(B) : "cc"); \
3289 } while(0)
3290
3291 #else //_WINDOWS
3292
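// Subtract 0:b from carry:a and return the borrow-adjusted carry, as in the
// GCC asm version above. Here a - b is computed as a + ~b + 1 by seeding the
// carry chain with 1 and adding the complemented words of b with _addcarry_u64.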
3293 static julong
3294 sub(julong a[], julong b[], julong carry, long len) {
3295 long i;
3296 julong tmp;
3297 unsigned char c = 1;
3298 for (i = 0; i < len; i++) {
3299 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3300 a[i] = tmp;
3301 }
3302 c = _addcarry_u64(c, carry, ~0, &tmp);
3303 return tmp;
3304 }
3305
3306 // Multiply (unsigned) Long A by Long B, accumulating the double-
3307 // length result into the accumulator formed of T0, T1, and T2.
3308 #define MACC(A, B, T0, T1, T2) \
3309 do { \
3310 julong hi, lo; \
3311 lo = _umul128(A, B, &hi); \
3312 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3313 c = _addcarry_u64(c, hi, T1, &T1); \
3314 _addcarry_u64(c, T2, 0, &T2); \
3315 } while(0)
3316
3317 // As above, but add twice the double-length result into the
3318 // accumulator.
3319 #define MACC2(A, B, T0, T1, T2) \
3320 do { \
3321 julong hi, lo; \
3322 lo = _umul128(A, B, &hi); \
3323 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3324 c = _addcarry_u64(c, hi, T1, &T1); \
3325 _addcarry_u64(c, T2, 0, &T2); \
3326 c = _addcarry_u64(0, lo, T0, &T0); \
3327 c = _addcarry_u64(c, hi, T1, &T1); \
3328 _addcarry_u64(c, T2, 0, &T2); \
3329 } while(0)
3330
3331 #endif //_WINDOWS
3332
3333 // Fast Montgomery multiplication. The derivation of the algorithm is
3334 // in A Cryptographic Library for the Motorola DSP56000,
3335 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
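//
// The key invariant (checked by the asserts below) is inv == -n[0]^-1 mod 2^64,
// i.e. inv * n[0] == ULLONG_MAX. Each outer iteration therefore picks
// m[i] = t0 * inv so that t0 + m[i] * n[0] == 0 (mod 2^64), which zeroes the
// low word of the accumulator and allows it to be shifted down by one
// 64-bit digit (t0 = t1; t1 = t2).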
3336
3337 static void NOINLINE
3338 montgomery_multiply(julong a[], julong b[], julong n[],
3339 julong m[], julong inv, int len) {
3340 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3341 int i;
3342
3343 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3344
3345 for (i = 0; i < len; i++) {
3346 int j;
3347 for (j = 0; j < i; j++) {
3348 MACC(a[j], b[i-j], t0, t1, t2);
3349 MACC(m[j], n[i-j], t0, t1, t2);
3350 }
3351 MACC(a[i], b[0], t0, t1, t2);
3352 m[i] = t0 * inv;
3353 MACC(m[i], n[0], t0, t1, t2);
3354
3355 assert(t0 == 0, "broken Montgomery multiply");
3356
3357 t0 = t1; t1 = t2; t2 = 0;
3358 }
3359
3360 for (i = len; i < 2*len; i++) {
3361 int j;
3362 for (j = i-len+1; j < len; j++) {
3363 MACC(a[j], b[i-j], t0, t1, t2);
3364 MACC(m[j], n[i-j], t0, t1, t2);
3365 }
3366 m[i-len] = t0;
3367 t0 = t1; t1 = t2; t2 = 0;
3368 }
3369
3370 while (t0)
3371 t0 = sub(m, n, t0, len);
3372 }
3373
3374 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3375 // multiplies so it should be up to 25% faster than Montgomery
3376 // multiplication. However, its loop control is more complex and it
3377 // may actually run slower on some machines.
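//
// The saving comes from the symmetric cross terms: each product a[j]*a[i-j]
// with j != i-j is computed once and added twice via MACC2, so only the
// diagonal terms a[j]*a[j] need a plain MACC.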
3378
3379 static void NOINLINE
3380 montgomery_square(julong a[], julong n[],
3381 julong m[], julong inv, int len) {
3382 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3383 int i;
3384
3385 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3386
3387 for (i = 0; i < len; i++) {
3388 int j;
3389 int end = (i+1)/2;
3390 for (j = 0; j < end; j++) {
3391 MACC2(a[j], a[i-j], t0, t1, t2);
3392 MACC(m[j], n[i-j], t0, t1, t2);
3393 }
3394 if ((i & 1) == 0) {
3395 MACC(a[j], a[j], t0, t1, t2);
3396 }
3397 for (; j < i; j++) {
3398 MACC(m[j], n[i-j], t0, t1, t2);
3399 }
3400 m[i] = t0 * inv;
3401 MACC(m[i], n[0], t0, t1, t2);
3402
3403 assert(t0 == 0, "broken Montgomery square");
3404
3405 t0 = t1; t1 = t2; t2 = 0;
3406 }
3407
3408 for (i = len; i < 2*len; i++) {
3409 int start = i-len+1;
3410 int end = start + (len - start)/2;
3411 int j;
3412 for (j = start; j < end; j++) {
3413 MACC2(a[j], a[i-j], t0, t1, t2);
3414 MACC(m[j], n[i-j], t0, t1, t2);
3415 }
3416 if ((i & 1) == 0) {
3417 MACC(a[j], a[j], t0, t1, t2);
3418 }
3419 for (; j < len; j++) {
3420 MACC(m[j], n[i-j], t0, t1, t2);
3421 }
3422 m[i-len] = t0;
3423 t0 = t1; t1 = t2; t2 = 0;
3424 }
3425
3426 while (t0)
3427 t0 = sub(m, n, t0, len);
3428 }
3429
3430 // Swap words in a longword.
3431 static julong swap(julong x) {
3432 return (x << 32) | (x >> 32);
3433 }
3434
3435 // Copy len longwords from s to d, word-swapping as we go. The
3436 // destination array is reversed.
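//
// The jint arrays passed in from the Java side are in BigInteger order (most
// significant word first), while the routines above expect little-endian
// arrays of julongs (least significant word first). Reversing the word order
// and swapping the two 32-bit halves of each julong (which end up transposed
// when a pair of adjacent jints is read as one little-endian julong) converts
// between the two layouts.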
3437 static void reverse_words(julong *s, julong *d, int len) {
3438 d += len;
3439 while(len-- > 0) {
3440 d--;
3441 *d = swap(*s);
3442 s++;
3443 }
3444 }
3445
3446 // The threshold at which squaring is advantageous was determined
3447 // experimentally on an i7-3930K CPU @ 3.5GHz.
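// The threshold is in jints, so squaring is used for operands of at least
// 64 * 32 = 2048 bits.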
3448 #define MONTGOMERY_SQUARING_THRESHOLD 64
3449
3450 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3451 jint len, jlong inv,
3452 jint *m_ints) {
3453 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3454 int longwords = len/2;
3455
3456 // Make very sure we don't use so much space that the stack might
3457   // overflow. 512 jints correspond to a 16384-bit integer and
3458 // will use here a total of 8k bytes of stack space.
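  // For example, len == 512 jints gives longwords == 256, and the scratch
  // allocation below is 256 * sizeof(julong) * 4 == 8192 bytes.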
3459 int divisor = sizeof(julong) * 4;
3460 guarantee(longwords <= 8192 / divisor, "must be");
3461 int total_allocation = longwords * sizeof (julong) * 4;
3462 julong *scratch = (julong *)alloca(total_allocation);
3463
3464 // Local scratch arrays
3465 julong
3466 *a = scratch + 0 * longwords,
3467 *b = scratch + 1 * longwords,
3468 *n = scratch + 2 * longwords,
3469 *m = scratch + 3 * longwords;
3470
3471 reverse_words((julong *)a_ints, a, longwords);
3472 reverse_words((julong *)b_ints, b, longwords);
3473 reverse_words((julong *)n_ints, n, longwords);
3474
3475 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3476
3477 reverse_words(m, (julong *)m_ints, longwords);
3478 }
3479
3480 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3481 jint len, jlong inv,
3482 jint *m_ints) {
3483 assert(len % 2 == 0, "array length in montgomery_square must be even");
3484 int longwords = len/2;
3485
3486 // Make very sure we don't use so much space that the stack might
3487   // overflow. 512 jints correspond to a 16384-bit integer and
3488 // will use here a total of 6k bytes of stack space.
3489 int divisor = sizeof(julong) * 3;
3490 guarantee(longwords <= (8192 / divisor), "must be");
3491 int total_allocation = longwords * sizeof (julong) * 3;
3492 julong *scratch = (julong *)alloca(total_allocation);
3493
3494 // Local scratch arrays
3495 julong
3496 *a = scratch + 0 * longwords,
3497 *n = scratch + 1 * longwords,
3498 *m = scratch + 2 * longwords;
3499
3500 reverse_words((julong *)a_ints, a, longwords);
3501 reverse_words((julong *)n_ints, n, longwords);
3502
3503 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3504 ::montgomery_square(a, n, m, (julong)inv, longwords);
3505 } else {
3506 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3507 }
3508
3509 reverse_words(m, (julong *)m_ints, longwords);
3510 }
3511
3512 #if INCLUDE_JFR
3513
3514 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
3515 // It returns a jobject handle to the event writer.
3516 // The handle is dereferenced and the return value is the event writer oop.
3517 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3518 enum layout {
3519 rbp_off,
3520 rbpH_off,
3521 return_off,
3522 return_off2,
3523 framesize // inclusive of return address
3524 };
3525
3526 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3527 CodeBuffer code(name, 1024, 64);
3528 MacroAssembler* masm = new MacroAssembler(&code);
3529 address start = __ pc();
3530
3531 __ enter();
3532 address the_pc = __ pc();
3533
3534 int frame_complete = the_pc - start;
3535
3536 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3537 __ movptr(c_rarg0, r15_thread);
3538 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3539 __ reset_last_Java_frame(true);
3540
3541 // rax is jobject handle result, unpack and process it through a barrier.
3542 __ resolve_global_jobject(rax, c_rarg0);
3543
3544 __ leave();
3545 __ ret(0);
3546
3547 OopMapSet* oop_maps = new OopMapSet();
3548 OopMap* map = new OopMap(framesize, 1);
3549 oop_maps->add_gc_map(frame_complete, map);
3550
3551 RuntimeStub* stub =
3552 RuntimeStub::new_runtime_stub(name,
3553 &code,
3554 frame_complete,
3555 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3556 oop_maps,
3557 false);
3558 return stub;
3559 }
3560
3561 // For c2: call to return a leased buffer.
3562 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3563 enum layout {
3564 rbp_off,
3565 rbpH_off,
3566 return_off,
3567 return_off2,
3568 framesize // inclusive of return address
3569 };
3570
3571 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3572 CodeBuffer code(name, 1024, 64);
3573 MacroAssembler* masm = new MacroAssembler(&code);
3574 address start = __ pc();
3575
3576 __ enter();
3577 address the_pc = __ pc();
3578
3579 int frame_complete = the_pc - start;
3580
3581 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3582 __ movptr(c_rarg0, r15_thread);
3583 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3584 __ reset_last_Java_frame(true);
3585
3586 __ leave();
3587 __ ret(0);
3588
3589 OopMapSet* oop_maps = new OopMapSet();
3590 OopMap* map = new OopMap(framesize, 1);
3591 oop_maps->add_gc_map(frame_complete, map);
3592
3593 RuntimeStub* stub =
3594 RuntimeStub::new_runtime_stub(name,
3595 &code,
3596 frame_complete,
3597 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3598 oop_maps,
3599 false);
3600 return stub;
3601 }
3602
3603 #endif // INCLUDE_JFR