1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_JVMCI
69 #include "jvmci/jvmciJavaClasses.hpp"
70 #endif
71
72 #define __ masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif // PRODUCT
79
80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
81
82 class RegisterSaver {
83 // Capture info about frame layout. Layout offsets are in jint
84 // units because compiler frame slots are jints.
85 #define XSAVE_AREA_BEGIN 160
86 #define XSAVE_AREA_YMM_BEGIN 576
87 #define XSAVE_AREA_EGPRS 960
88 #define XSAVE_AREA_OPMASK_BEGIN 1088
89 #define XSAVE_AREA_ZMM_BEGIN 1152
90 #define XSAVE_AREA_UPPERBANK 1664
91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
96 enum layout {
97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
99 DEF_XMM_OFFS(0),
100 DEF_XMM_OFFS(1),
101 // 2..15 are implied in range usage
102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
103 DEF_YMM_OFFS(0),
104 DEF_YMM_OFFS(1),
105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
106 r16H_off,
107 r17_off, r17H_off,
108 r18_off, r18H_off,
109 r19_off, r19H_off,
110 r20_off, r20H_off,
111 r21_off, r21H_off,
112 r22_off, r22H_off,
113 r23_off, r23H_off,
114 r24_off, r24H_off,
115 r25_off, r25H_off,
116 r26_off, r26H_off,
117 r27_off, r27H_off,
118 r28_off, r28H_off,
119 r29_off, r29H_off,
120 r30_off, r30H_off,
121 r31_off, r31H_off,
122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
123 DEF_OPMASK_OFFS(0),
124 DEF_OPMASK_OFFS(1),
125 // 2..7 are implied in range usage
126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
127 DEF_ZMM_OFFS(0),
128 DEF_ZMM_OFFS(1),
129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
130 DEF_ZMM_UPPER_OFFS(16),
131 DEF_ZMM_UPPER_OFFS(17),
132 // 18..31 are implied in range usage
133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
134 fpu_stateH_end,
135 r15_off, r15H_off,
136 r14_off, r14H_off,
137 r13_off, r13H_off,
138 r12_off, r12H_off,
139 r11_off, r11H_off,
140 r10_off, r10H_off,
141 r9_off, r9H_off,
142 r8_off, r8H_off,
143 rdi_off, rdiH_off,
144 rsi_off, rsiH_off,
145 ignore_off, ignoreH_off, // extra copy of rbp
146 rsp_off, rspH_off,
147 rbx_off, rbxH_off,
148 rdx_off, rdxH_off,
149 rcx_off, rcxH_off,
150 rax_off, raxH_off,
151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
152 align_off, alignH_off,
153 flags_off, flagsH_off,
154 // The frame sender code expects that rbp will be in the "natural" place and
155 // will override any oopMap setting for it. We must therefore force the layout
156 // so that it agrees with the frame sender code.
157 rbp_off, rbpH_off, // copy of rbp we will restore
158 return_off, returnH_off, // slot for return address
159 reg_save_size // size in compiler stack slots
160 };
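// Illustrative arithmetic (assuming frame::arg_reg_save_area_bytes == 0, i.e. a non-Windows
// target): fpu_state_off == 0, so each *_off above is simply the XSAVE byte offset divided
// by BytesPerInt. For example xmm_off == 160/4 == 40, xmm0_off == 40, xmm1_off == 44
// (an XMM register spans 16 bytes == 4 jint slots), ymm_off == 576/4 == 144 and
// opmask_off == 1088/4 == 272. The XSAVE_AREA_* constants mirror the processor-defined
// fxsave/xsave image layout, in which the legacy XMM save area starts at byte 160.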
161
162 public:
163 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
164 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
165
166 // Offsets into the register save area
167 // Used by deoptimization when it is managing result register
168 // values on its own
169
170 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
171 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
172 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
173 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; }
174 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
175 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
176
177 // During deoptimization only the result registers need to be restored,
178 // all the other values have already been extracted.
179 static void restore_result_registers(MacroAssembler* masm);
180 };
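// Typical usage elsewhere in this file (sketch only):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
//   // ... emit the call into the VM ...
//   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
//
// The returned OopMap is added to the blob's OopMapSet so that GC and deoptimization can
// find every saved register slot.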
181
182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
183 int off = 0;
184 int num_xmm_regs = XMMRegister::available_xmm_registers();
185 #if COMPILER2_OR_JVMCI
186 if (save_wide_vectors && UseAVX == 0) {
187 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
188 }
189 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
190 #else
191 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
192 #endif
193
194   // Always make the frame size 16-byte aligned; the full save area is allocated whether or not wide vectors are saved
195 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
196 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
197 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
198 // CodeBlob frame size is in words.
199 int frame_size_in_words = frame_size_in_bytes / wordSize;
200 *total_frame_words = frame_size_in_words;
201
202 // Save registers, fpu state, and flags.
203 // We assume caller has already pushed the return address onto the
204 // stack, so rsp is 8-byte aligned here.
205   // We push rbp twice in this sequence because we want the real rbp
206   // to be under the return address like a normal enter.
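  // After the sequence below the save area reads, from low to high addresses:
  // [fxsave/xsave state] [legacy GPRs] [alignment word] [flags] [saved rbp] [return address],
  // which matches the layout enum above read from fpu_state_off up to return_off.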
207
208 __ enter(); // rsp becomes 16-byte aligned here
209 __ pushf();
210 // Make sure rsp stays 16-byte aligned
211 __ subq(rsp, 8);
212   // Push CPU state in multiples of 16 bytes
213 __ save_legacy_gprs();
214 __ push_FPU_state();
215
216
217 // push cpu state handles this on EVEX enabled targets
218 if (save_wide_vectors) {
219 // Save upper half of YMM registers(0..15)
220 int base_addr = XSAVE_AREA_YMM_BEGIN;
221 for (int n = 0; n < 16; n++) {
222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
223 }
224 if (VM_Version::supports_evex()) {
225 // Save upper half of ZMM registers(0..15)
226 base_addr = XSAVE_AREA_ZMM_BEGIN;
227 for (int n = 0; n < 16; n++) {
228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
229 }
230 // Save full ZMM registers(16..num_xmm_regs)
231 base_addr = XSAVE_AREA_UPPERBANK;
232 off = 0;
233 int vector_len = Assembler::AVX_512bit;
234 for (int n = 16; n < num_xmm_regs; n++) {
235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
236 }
237 #if COMPILER2_OR_JVMCI
238 base_addr = XSAVE_AREA_OPMASK_BEGIN;
239 off = 0;
240 for(int n = 0; n < KRegister::number_of_registers; n++) {
241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
242 }
243 #endif
244 }
245 } else {
246 if (VM_Version::supports_evex()) {
247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
248 int base_addr = XSAVE_AREA_UPPERBANK;
249 off = 0;
250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
251 for (int n = 16; n < num_xmm_regs; n++) {
252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
253 }
254 #if COMPILER2_OR_JVMCI
255 base_addr = XSAVE_AREA_OPMASK_BEGIN;
256 off = 0;
257 for(int n = 0; n < KRegister::number_of_registers; n++) {
258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
259 }
260 #endif
261 }
262 }
263
264 #if COMPILER2_OR_JVMCI
265 if (UseAPX) {
266 int base_addr = XSAVE_AREA_EGPRS;
267 off = 0;
268 for (int n = 16; n < Register::number_of_registers; n++) {
269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
270 }
271 }
272 #endif
273
274 __ vzeroupper();
275 if (frame::arg_reg_save_area_bytes != 0) {
276 // Allocate argument register save area
277 __ subptr(rsp, frame::arg_reg_save_area_bytes);
278 }
279
280 // Set an oopmap for the call site. This oopmap will map all
281 // oop-registers and debug-info registers as callee-saved. This
282 // will allow deoptimization at this safepoint to find all possible
283 // debug-info recordings, as well as let GC find all oops.
284
285 OopMapSet *oop_maps = new OopMapSet();
286 OopMap* map = new OopMap(frame_size_in_slots, 0);
287
288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
289
290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
294 // rbp location is known implicitly by the frame sender code, needs no oopmap
295   // and the location where rbp was saved is ignored
296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
306
307 if (UseAPX) {
308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
324 }
325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
326   // on EVEX-enabled targets it is also included in the XSAVE area
327 off = xmm0_off;
328 int delta = xmm1_off - off;
329 for (int n = 0; n < 16; n++) {
330 XMMRegister xmm_name = as_XMMRegister(n);
331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
332 off += delta;
333 }
334 if (UseAVX > 2) {
335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
336 off = zmm16_off;
337 delta = zmm17_off - off;
338 for (int n = 16; n < num_xmm_regs; n++) {
339 XMMRegister zmm_name = as_XMMRegister(n);
340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
341 off += delta;
342 }
343 }
344
345 #if COMPILER2_OR_JVMCI
346 if (save_wide_vectors) {
347 // Save upper half of YMM registers(0..15)
348 off = ymm0_off;
349 delta = ymm1_off - ymm0_off;
350 for (int n = 0; n < 16; n++) {
351 XMMRegister ymm_name = as_XMMRegister(n);
352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
353 off += delta;
354 }
355 if (VM_Version::supports_evex()) {
356 // Save upper half of ZMM registers(0..15)
357 off = zmm0_off;
358 delta = zmm1_off - zmm0_off;
359 for (int n = 0; n < 16; n++) {
360 XMMRegister zmm_name = as_XMMRegister(n);
361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
362 off += delta;
363 }
364 }
365 }
366 #endif // COMPILER2_OR_JVMCI
367
368 // %%% These should all be a waste but we'll keep things as they were for now
369 if (true) {
370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
374 // rbp location is known implicitly by the frame sender code, needs no oopmap
375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
385 if (UseAPX) {
386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
402 }
403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
404     // on EVEX-enabled targets it is also included in the XSAVE area
405 off = xmm0H_off;
406 delta = xmm1H_off - off;
407 for (int n = 0; n < 16; n++) {
408 XMMRegister xmm_name = as_XMMRegister(n);
409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
410 off += delta;
411 }
412 if (UseAVX > 2) {
413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
414 off = zmm16H_off;
415 delta = zmm17H_off - off;
416 for (int n = 16; n < num_xmm_regs; n++) {
417 XMMRegister zmm_name = as_XMMRegister(n);
418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
419 off += delta;
420 }
421 }
422 }
423
424 return map;
425 }
426
427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
428 int num_xmm_regs = XMMRegister::available_xmm_registers();
429 if (frame::arg_reg_save_area_bytes != 0) {
430 // Pop arg register save area
431 __ addptr(rsp, frame::arg_reg_save_area_bytes);
432 }
433
434 #if COMPILER2_OR_JVMCI
435 if (restore_wide_vectors) {
436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
438 }
439 #else
440 assert(!restore_wide_vectors, "vectors are generated only by C2");
441 #endif
442
443 __ vzeroupper();
444
445 // On EVEX enabled targets everything is handled in pop fpu state
446 if (restore_wide_vectors) {
447 // Restore upper half of YMM registers (0..15)
448 int base_addr = XSAVE_AREA_YMM_BEGIN;
449 for (int n = 0; n < 16; n++) {
450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
451 }
452 if (VM_Version::supports_evex()) {
453 // Restore upper half of ZMM registers (0..15)
454 base_addr = XSAVE_AREA_ZMM_BEGIN;
455 for (int n = 0; n < 16; n++) {
456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
457 }
458 // Restore full ZMM registers(16..num_xmm_regs)
459 base_addr = XSAVE_AREA_UPPERBANK;
460 int vector_len = Assembler::AVX_512bit;
461 int off = 0;
462 for (int n = 16; n < num_xmm_regs; n++) {
463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
464 }
465 #if COMPILER2_OR_JVMCI
466 base_addr = XSAVE_AREA_OPMASK_BEGIN;
467 off = 0;
468 for (int n = 0; n < KRegister::number_of_registers; n++) {
469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
470 }
471 #endif
472 }
473 } else {
474 if (VM_Version::supports_evex()) {
475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
476 int base_addr = XSAVE_AREA_UPPERBANK;
477 int off = 0;
478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
479 for (int n = 16; n < num_xmm_regs; n++) {
480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
481 }
482 #if COMPILER2_OR_JVMCI
483 base_addr = XSAVE_AREA_OPMASK_BEGIN;
484 off = 0;
485 for (int n = 0; n < KRegister::number_of_registers; n++) {
486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
487 }
488 #endif
489 }
490 }
491
492 #if COMPILER2_OR_JVMCI
493 if (UseAPX) {
494 int base_addr = XSAVE_AREA_EGPRS;
495 int off = 0;
496 for (int n = 16; n < Register::number_of_registers; n++) {
497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
498 }
499 }
500 #endif
501
502 // Recover CPU state
503 __ pop_FPU_state();
504 __ restore_legacy_gprs();
505 __ addq(rsp, 8);
506 __ popf();
507 // Get the rbp described implicitly by the calling convention (no oopMap)
508 __ pop(rbp);
509 }
510
511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
512
513   // Just restore the result registers. Only used by deoptimization. By
514   // now any callee-save register that needs to be restored to a c2
515   // caller of the deoptee has been extracted into the vframeArray
516   // and will be stuffed into the c2i adapter we create for later
517   // restoration, so only result registers need to be restored here.
518
519 // Restore fp result register
520 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
521 // Restore integer result register
522 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
523 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
524
525   // Pop all of the register save area off the stack except the return address
526 __ addptr(rsp, return_offset_in_bytes());
527 }
528
529 // Is the vector's size (in bytes) bigger than the size saved by default?
530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
531 bool SharedRuntime::is_wide_vector(int size) {
532 return size > 16;
533 }
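// For example, is_wide_vector(16) is false (covered by the default XMM save),
// while is_wide_vector(32) and is_wide_vector(64) are true (YMM/ZMM state needs
// the extra handling in RegisterSaver above).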
534
535 // ---------------------------------------------------------------------------
536 // Read the array of BasicTypes from a signature, and compute where the
537 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
538 // quantities. Values less than VMRegImpl::stack0 are registers, those above
539 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
540 // as frame sizes are fixed.
541 // VMRegImpl::stack0 refers to the first slot 0(sp),
542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
543 // Registers up to Register::number_of_registers are the 64-bit
544 // integer registers.
545
546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
547 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
548 // units regardless of build. Of course for i486 there is no 64-bit build.
549
550 // The Java calling convention is a "shifted" version of the C ABI.
551 // By skipping the first C ABI register we can call non-static jni methods
552 // with small numbers of arguments without having to shuffle the arguments
553 // at all. Since we control the java ABI we ought to at least get some
554 // advantage out of it.
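// Illustrative example of the mapping computed below: for an instance method with
// signature (IJD)V the expanded signature is
//   { T_OBJECT /* receiver */, T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID }
// and the loop assigns receiver -> j_rarg0, int -> j_rarg1, long -> j_rarg2 and
// double -> j_farg0, with each trailing T_VOID half marked set_bad(). No stack slots
// are used, so the return value (stk_args) is 0.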
555
556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
557 VMRegPair *regs,
558 int total_args_passed) {
559
560 // Create the mapping between argument positions and
561 // registers.
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
564 };
565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
566 j_farg0, j_farg1, j_farg2, j_farg3,
567 j_farg4, j_farg5, j_farg6, j_farg7
568 };
569
570
571 uint int_args = 0;
572 uint fp_args = 0;
573 uint stk_args = 0;
574
575 for (int i = 0; i < total_args_passed; i++) {
576 switch (sig_bt[i]) {
577 case T_BOOLEAN:
578 case T_CHAR:
579 case T_BYTE:
580 case T_SHORT:
581 case T_INT:
582 if (int_args < Argument::n_int_register_parameters_j) {
583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
584 } else {
585 stk_args = align_up(stk_args, 2);
586 regs[i].set1(VMRegImpl::stack2reg(stk_args));
587 stk_args += 1;
588 }
589 break;
590 case T_VOID:
591 // halves of T_LONG or T_DOUBLE
592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
593 regs[i].set_bad();
594 break;
595 case T_LONG:
596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
597 // fall through
598 case T_OBJECT:
599 case T_ARRAY:
600 case T_ADDRESS:
601 if (int_args < Argument::n_int_register_parameters_j) {
602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
603 } else {
604 stk_args = align_up(stk_args, 2);
605 regs[i].set2(VMRegImpl::stack2reg(stk_args));
606 stk_args += 2;
607 }
608 break;
609 case T_FLOAT:
610 if (fp_args < Argument::n_float_register_parameters_j) {
611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
612 } else {
613 stk_args = align_up(stk_args, 2);
614 regs[i].set1(VMRegImpl::stack2reg(stk_args));
615 stk_args += 1;
616 }
617 break;
618 case T_DOUBLE:
619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
620 if (fp_args < Argument::n_float_register_parameters_j) {
621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
622 } else {
623 stk_args = align_up(stk_args, 2);
624 regs[i].set2(VMRegImpl::stack2reg(stk_args));
625 stk_args += 2;
626 }
627 break;
628 default:
629 ShouldNotReachHere();
630 break;
631 }
632 }
633
634 return stk_args;
635 }
636
637 // Patch the caller's callsite with an entry to compiled code if it exists.
638 static void patch_callers_callsite(MacroAssembler *masm) {
639 Label L;
640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
641 __ jcc(Assembler::equal, L);
642
643 // Save the current stack pointer
644 __ mov(r13, rsp);
645 // Schedule the branch target address early.
646 // Call into the VM to patch the caller, then jump to compiled callee
647 // rax isn't live so capture return address while we easily can
648 __ movptr(rax, Address(rsp, 0));
649
650 // align stack so push_CPU_state doesn't fault
651 __ andptr(rsp, -(StackAlignmentInBytes));
652 __ push_CPU_state();
653 __ vzeroupper();
654 // VM needs caller's callsite
655 // VM needs target method
656 // This needs to be a long call since we will relocate this adapter to
657 // the codeBuffer and it may not reach
658
659 // Allocate argument register save area
660 if (frame::arg_reg_save_area_bytes != 0) {
661 __ subptr(rsp, frame::arg_reg_save_area_bytes);
662 }
663 __ mov(c_rarg0, rbx);
664 __ mov(c_rarg1, rax);
665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
666
667 // De-allocate argument register save area
668 if (frame::arg_reg_save_area_bytes != 0) {
669 __ addptr(rsp, frame::arg_reg_save_area_bytes);
670 }
671
672 __ vzeroupper();
673 __ pop_CPU_state();
674 // restore sp
675 __ mov(rsp, r13);
676 __ bind(L);
677 }
678
679 static void gen_c2i_adapter(MacroAssembler *masm,
680 int total_args_passed,
681 int comp_args_on_stack,
682 const BasicType *sig_bt,
683 const VMRegPair *regs,
684 Label& skip_fixup) {
685 // Before we get into the guts of the C2I adapter, see if we should be here
686 // at all. We've come from compiled code and are attempting to jump to the
687 // interpreter, which means the caller made a static call to get here
688 // (vcalls always get a compiled target if there is one). Check for a
689 // compiled target. If there is one, we need to patch the caller's call.
690 patch_callers_callsite(masm);
691
692 __ bind(skip_fixup);
693
694 // Since all args are passed on the stack, total_args_passed *
695 // Interpreter::stackElementSize is the space we need.
696
697 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
698
699 int extraspace = (total_args_passed * Interpreter::stackElementSize);
700
701 // stack is aligned, keep it that way
702 // This is not currently needed or enforced by the interpreter, but
703 // we might as well conform to the ABI.
704 extraspace = align_up(extraspace, 2*wordSize);
705
706 // set senderSP value
707 __ lea(r13, Address(rsp, wordSize));
708
709 #ifdef ASSERT
710 __ check_stack_alignment(r13, "sender stack not aligned");
711 #endif
712 if (extraspace > 0) {
713 // Pop the return address
714 __ pop(rax);
715
716 __ subptr(rsp, extraspace);
717
718 // Push the return address
719 __ push(rax);
720
721 // Account for the return address location since we store it first rather
722 // than hold it in a register across all the shuffling
723 extraspace += wordSize;
724 }
725
726 #ifdef ASSERT
727 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
728 #endif
729
730 // Now write the args into the outgoing interpreter space
731 for (int i = 0; i < total_args_passed; i++) {
732 if (sig_bt[i] == T_VOID) {
733 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
734 continue;
735 }
736
737 // offset to start parameters
738 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
739 int next_off = st_off - Interpreter::stackElementSize;
740
741 // Say 4 args:
742 // i st_off
743 // 0 32 T_LONG
744 // 1 24 T_VOID
745 // 2 16 T_OBJECT
746 // 3 8 T_BOOL
747 // - 0 return address
748 //
749     // However, to make things extra confusing: because we can fit a long/double in
750     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
751     // leaves one slot empty and only stores to a single slot. In this case the
752     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
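    // Concretely, for the T_LONG at i == 0 above the value is stored at next_off == 24
    // (the T_VOID slot); the slot at st_off == 32 is only written with debug junk in
    // ASSERT builds.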
753
754 VMReg r_1 = regs[i].first();
755 VMReg r_2 = regs[i].second();
756 if (!r_1->is_valid()) {
757 assert(!r_2->is_valid(), "");
758 continue;
759 }
760 if (r_1->is_stack()) {
761       // memory to memory: use rax as a temporary
762 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
763 if (!r_2->is_valid()) {
764 // sign extend??
765 __ movl(rax, Address(rsp, ld_off));
766 __ movptr(Address(rsp, st_off), rax);
767
768 } else {
769
770 __ movq(rax, Address(rsp, ld_off));
771
772         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
773 // T_DOUBLE and T_LONG use two slots in the interpreter
774 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
775 // ld_off == LSW, ld_off+wordSize == MSW
776 // st_off == MSW, next_off == LSW
777 __ movq(Address(rsp, next_off), rax);
778 #ifdef ASSERT
779 // Overwrite the unused slot with known junk
780 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
781 __ movptr(Address(rsp, st_off), rax);
782 #endif /* ASSERT */
783 } else {
784 __ movq(Address(rsp, st_off), rax);
785 }
786 }
787 } else if (r_1->is_Register()) {
788 Register r = r_1->as_Register();
789 if (!r_2->is_valid()) {
790         // must be only an int (or less), so move only 32 bits to the slot
791 // why not sign extend??
792 __ movl(Address(rsp, st_off), r);
793 } else {
794         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
795 // T_DOUBLE and T_LONG use two slots in the interpreter
796 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
797 // long/double in gpr
798 #ifdef ASSERT
799 // Overwrite the unused slot with known junk
800 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
801 __ movptr(Address(rsp, st_off), rax);
802 #endif /* ASSERT */
803 __ movq(Address(rsp, next_off), r);
804 } else {
805 __ movptr(Address(rsp, st_off), r);
806 }
807 }
808 } else {
809 assert(r_1->is_XMMRegister(), "");
810 if (!r_2->is_valid()) {
811         // only a float: use just part of the slot
812 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
813 } else {
814 #ifdef ASSERT
815 // Overwrite the unused slot with known junk
816 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
817 __ movptr(Address(rsp, st_off), rax);
818 #endif /* ASSERT */
819 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
820 }
821 }
822 }
823
824 // Schedule the branch target address early.
825 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
826 __ jmp(rcx);
827 }
828
829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
830 int total_args_passed,
831 int comp_args_on_stack,
832 const BasicType *sig_bt,
833 const VMRegPair *regs) {
834
835   // Note: r13 contains the senderSP on entry. We must preserve it since
836   // we may do an i2c -> c2i transition if we lose a race where compiled
837   // code goes non-entrant while we get args ready.
838   // In addition we use r13 to locate all the interpreter args as
839   // we must align the stack to 16 bytes on an i2c entry, or else we
840   // lose the alignment we expect in all compiled code and the register
841   // save code can segv when fxsave instructions find an improperly
842   // aligned stack pointer.
843
844 // Adapters can be frameless because they do not require the caller
845 // to perform additional cleanup work, such as correcting the stack pointer.
846 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
847 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
848 // even if a callee has modified the stack pointer.
849 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
850 // routinely repairs its caller's stack pointer (from sender_sp, which is set
851 // up via the senderSP register).
852 // In other words, if *either* the caller or callee is interpreted, we can
853 // get the stack pointer repaired after a call.
854 // This is why c2i and i2c adapters cannot be indefinitely composed.
855 // In particular, if a c2i adapter were to somehow call an i2c adapter,
856 // both caller and callee would be compiled methods, and neither would
857 // clean up the stack pointer changes performed by the two adapters.
858 // If this happens, control eventually transfers back to the compiled
859 // caller, but with an uncorrected stack, causing delayed havoc.
860
861 // Must preserve original SP for loading incoming arguments because
862 // we need to align the outgoing SP for compiled code.
863 __ movptr(r11, rsp);
864
865 // Pick up the return address
866 __ pop(rax);
867
868 // Convert 4-byte c2 stack slots to words.
869 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
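  // For example, comp_args_on_stack == 3 (three 4-byte slots) gives
  // comp_words_on_stack == align_up(12, 8) >> 3 == 2 words.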
870
871 if (comp_args_on_stack) {
872 __ subptr(rsp, comp_words_on_stack * wordSize);
873 }
874
875 // Ensure compiled code always sees stack at proper alignment
876 __ andptr(rsp, -16);
877
878   // Push the return address and misalign the stack so that the youngest frame
879   // always sees the same stack layout it would see right after a call instruction
880 __ push(rax);
881
882 // Put saved SP in another register
883 const Register saved_sp = rax;
884 __ movptr(saved_sp, r11);
885
886 // Will jump to the compiled code just as if compiled code was doing it.
887 // Pre-load the register-jump target early, to schedule it better.
888 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
889
890 #if INCLUDE_JVMCI
891 if (EnableJVMCI) {
892 // check if this call should be routed towards a specific entry point
893 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
894 Label no_alternative_target;
895 __ jcc(Assembler::equal, no_alternative_target);
896 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
897 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
898 __ bind(no_alternative_target);
899 }
900 #endif // INCLUDE_JVMCI
901
902 // Now generate the shuffle code. Pick up all register args and move the
903 // rest through the floating point stack top.
904 for (int i = 0; i < total_args_passed; i++) {
905 if (sig_bt[i] == T_VOID) {
906 // Longs and doubles are passed in native word order, but misaligned
907 // in the 32-bit build.
908 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
909 continue;
910 }
911
912 // Pick up 0, 1 or 2 words from SP+offset.
913
914 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
915 "scrambled load targets?");
916 // Load in argument order going down.
917 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
918 // Point to interpreter value (vs. tag)
919 int next_off = ld_off - Interpreter::stackElementSize;
920 //
921 //
922 //
923 VMReg r_1 = regs[i].first();
924 VMReg r_2 = regs[i].second();
925 if (!r_1->is_valid()) {
926 assert(!r_2->is_valid(), "");
927 continue;
928 }
929 if (r_1->is_stack()) {
930 // Convert stack slot to an SP offset (+ wordSize to account for return address )
931 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
932
933       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
934       // and if we end up going through a c2i because of a miss a reasonable value of r13
935       // will be generated.
936 if (!r_2->is_valid()) {
937 // sign extend???
938 __ movl(r13, Address(saved_sp, ld_off));
939 __ movptr(Address(rsp, st_off), r13);
940 } else {
941 //
942         // We are using two OptoRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
943         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
944         // so we must adjust where to pick up the data to match the interpreter.
945         //
946         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
947         // are accessed with negative offsets so the LSW is at the LOW address
948
949 // ld_off is MSW so get LSW
950 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
951 next_off : ld_off;
952 __ movq(r13, Address(saved_sp, offset));
953 // st_off is LSW (i.e. reg.first())
954 __ movq(Address(rsp, st_off), r13);
955 }
956 } else if (r_1->is_Register()) { // Register argument
957 Register r = r_1->as_Register();
958 assert(r != rax, "must be different");
959 if (r_2->is_valid()) {
960 //
961         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
962         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
963         // so we must adjust where to pick up the data to match the interpreter.
964
965 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
966 next_off : ld_off;
967
968 // this can be a misaligned move
969 __ movq(r, Address(saved_sp, offset));
970 } else {
971 // sign extend and use a full word?
972 __ movl(r, Address(saved_sp, ld_off));
973 }
974 } else {
975 if (!r_2->is_valid()) {
976 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
977 } else {
978 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
979 }
980 }
981 }
982
983 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
984
985 // 6243940 We might end up in handle_wrong_method if
986 // the callee is deoptimized as we race thru here. If that
987 // happens we don't want to take a safepoint because the
988 // caller frame will look interpreted and arguments are now
989 // "compiled" so it is much better to make this transition
990 // invisible to the stack walking code. Unfortunately if
991 // we try and find the callee by normal means a safepoint
992 // is possible. So we stash the desired callee in the thread
993   // and the VM will find it there should this case occur.
994
995 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
996
997   // Put Method* where a c2i adapter would expect it, should we end up there.
998   // Only needed because c2's resolve stubs return Method* as a result in
999   // rax.
1000 __ mov(rax, rbx);
1001 __ jmp(r11);
1002 }
1003
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006 int total_args_passed,
1007 int comp_args_on_stack,
1008 const BasicType *sig_bt,
1009 const VMRegPair *regs,
1010 address entry_address[AdapterBlob::ENTRY_COUNT]) {
1011 entry_address[AdapterBlob::I2C] = __ pc();
1012
1013 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014
1015 // -------------------------------------------------------------------------
1016 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
1017 // to the interpreter. The args start out packed in the compiled layout. They
1018 // need to be unpacked into the interpreter layout. This will almost always
1019 // require some stack space. We grow the current (compiled) stack, then repack
1020 // the args. We finally end in a jump to the generic interpreter entry point.
1021 // On exit from the interpreter, the interpreter will restore our SP (lest the
1022 // compiled code, which relies solely on SP and not RBP, get sick).
1023
1024 entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1025 Label skip_fixup;
1026
1027 Register data = rax;
1028 Register receiver = j_rarg0;
1029 Register temp = rbx;
1030
1031 {
1032 __ ic_check(1 /* end_alignment */);
1033 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1034 // Method might have been compiled since the call site was patched to
1035     // interpreted; if that is the case treat it as a miss so we can get
1036 // the call site corrected.
1037 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038 __ jcc(Assembler::equal, skip_fixup);
1039 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040 }
1041
1042 entry_address[AdapterBlob::C2I] = __ pc();
1043
1044 // Class initialization barrier for static methods
1045 entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1046 if (VM_Version::supports_fast_class_init_checks()) {
1047 Label L_skip_barrier;
1048 Register method = rbx;
1049
1050 { // Bypass the barrier for non-static methods
1051 Register flags = rscratch1;
1052 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053 __ testl(flags, JVM_ACC_STATIC);
1054 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055 }
1056
1057 Register klass = rscratch1;
1058 __ load_method_holder(klass, method);
1059 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1060
1061 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1062
1063 __ bind(L_skip_barrier);
1064 entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1065 }
1066
1067 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1068 bs->c2i_entry_barrier(masm);
1069
1070 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1071 return;
1072 }
1073
1074 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1075 VMRegPair *regs,
1076 int total_args_passed) {
1077
1078 // We return the amount of VMRegImpl stack slots we need to reserve for all
1079 // the arguments NOT counting out_preserve_stack_slots.
1080
1081 // NOTE: These arrays will have to change when c1 is ported
1082 #ifdef _WIN64
1083 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1084 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1085 };
1086 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1087 c_farg0, c_farg1, c_farg2, c_farg3
1088 };
1089 #else
1090 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1091 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1092 };
1093 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1094 c_farg0, c_farg1, c_farg2, c_farg3,
1095 c_farg4, c_farg5, c_farg6, c_farg7
1096 };
1097 #endif // _WIN64
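  // Illustrative example of the mapping below: for a native signature expanded as
  //   { T_ADDRESS /* JNIEnv* */, T_ADDRESS /* jobject handle */, T_INT, T_DOUBLE, T_VOID }
  // the System V (non-Windows) case assigns c_rarg0, c_rarg1, c_rarg2 and c_farg0,
  // while the Windows case assigns c_rarg0, c_rarg1, c_rarg2 and c_farg3, because the
  // Win64 ABI numbers integer and floating-point argument registers positionally and
  // always reserves shadow space for four register arguments (see the end of this function).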
1098
1099
1100 uint int_args = 0;
1101 uint fp_args = 0;
1102 uint stk_args = 0; // inc by 2 each time
1103
1104 for (int i = 0; i < total_args_passed; i++) {
1105 switch (sig_bt[i]) {
1106 case T_BOOLEAN:
1107 case T_CHAR:
1108 case T_BYTE:
1109 case T_SHORT:
1110 case T_INT:
1111 if (int_args < Argument::n_int_register_parameters_c) {
1112 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1113 #ifdef _WIN64
1114 fp_args++;
1115         // Allocate slots for the callee to stuff register args on the stack.
1116 stk_args += 2;
1117 #endif
1118 } else {
1119 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1120 stk_args += 2;
1121 }
1122 break;
1123 case T_LONG:
1124 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1125 // fall through
1126 case T_OBJECT:
1127 case T_ARRAY:
1128 case T_ADDRESS:
1129 case T_METADATA:
1130 if (int_args < Argument::n_int_register_parameters_c) {
1131 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1132 #ifdef _WIN64
1133 fp_args++;
1134 stk_args += 2;
1135 #endif
1136 } else {
1137 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1138 stk_args += 2;
1139 }
1140 break;
1141 case T_FLOAT:
1142 if (fp_args < Argument::n_float_register_parameters_c) {
1143 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1144 #ifdef _WIN64
1145 int_args++;
1146         // Allocate slots for the callee to stuff register args on the stack.
1147 stk_args += 2;
1148 #endif
1149 } else {
1150 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1151 stk_args += 2;
1152 }
1153 break;
1154 case T_DOUBLE:
1155 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1156 if (fp_args < Argument::n_float_register_parameters_c) {
1157 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1158 #ifdef _WIN64
1159 int_args++;
1160         // Allocate slots for the callee to stuff register args on the stack.
1161 stk_args += 2;
1162 #endif
1163 } else {
1164 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1165 stk_args += 2;
1166 }
1167 break;
1168 case T_VOID: // Halves of longs and doubles
1169 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1170 regs[i].set_bad();
1171 break;
1172 default:
1173 ShouldNotReachHere();
1174 break;
1175 }
1176 }
1177 #ifdef _WIN64
1178   // The Windows ABI requires that we always allocate enough stack space
1179   // for 4 64-bit registers to be stored down.
1180 if (stk_args < 8) {
1181 stk_args = 8;
1182 }
1183 #endif // _WIN64
1184
1185 return stk_args;
1186 }
1187
1188 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1189 uint num_bits,
1190 uint total_args_passed) {
1191 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1192 "only certain vector sizes are supported for now");
1193
1194 static const XMMRegister VEC_ArgReg[32] = {
1195 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1196 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1197 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1198 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1199 };
1200
1201 uint stk_args = 0;
1202 uint fp_args = 0;
1203
1204 for (uint i = 0; i < total_args_passed; i++) {
1205 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1206 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1207 regs[i].set_pair(vmreg->next(next_val), vmreg);
1208 }
1209
1210 return stk_args;
1211 }
1212
1213 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1214   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1215   // which by this time is free to use
1216 switch (ret_type) {
1217 case T_FLOAT:
1218 __ movflt(Address(rbp, -wordSize), xmm0);
1219 break;
1220 case T_DOUBLE:
1221 __ movdbl(Address(rbp, -wordSize), xmm0);
1222 break;
1223 case T_VOID: break;
1224 default: {
1225 __ movptr(Address(rbp, -wordSize), rax);
1226 }
1227 }
1228 }
1229
1230 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1231   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1232   // which by this time is free to use
1233 switch (ret_type) {
1234 case T_FLOAT:
1235 __ movflt(xmm0, Address(rbp, -wordSize));
1236 break;
1237 case T_DOUBLE:
1238 __ movdbl(xmm0, Address(rbp, -wordSize));
1239 break;
1240 case T_VOID: break;
1241 default: {
1242 __ movptr(rax, Address(rbp, -wordSize));
1243 }
1244 }
1245 }
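// These two helpers are used in matching pairs around runtime calls that may clobber the
// return value, roughly (sketch only; some_runtime_entry is a hypothetical entry point):
//
//   save_native_result(masm, ret_type, stack_slots);
//   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, some_runtime_entry)));
//   restore_native_result(masm, ret_type, stack_slots);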
1246
1247 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1248 for ( int i = first_arg ; i < arg_count ; i++ ) {
1249 if (args[i].first()->is_Register()) {
1250 __ push(args[i].first()->as_Register());
1251 } else if (args[i].first()->is_XMMRegister()) {
1252 __ subptr(rsp, 2*wordSize);
1253 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1254 }
1255 }
1256 }
1257
1258 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1259 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1260 if (args[i].first()->is_Register()) {
1261 __ pop(args[i].first()->as_Register());
1262 } else if (args[i].first()->is_XMMRegister()) {
1263 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1264 __ addptr(rsp, 2*wordSize);
1265 }
1266 }
1267 }
1268
1269 static void verify_oop_args(MacroAssembler* masm,
1270 const methodHandle& method,
1271 const BasicType* sig_bt,
1272 const VMRegPair* regs) {
1273 Register temp_reg = rbx; // not part of any compiled calling seq
1274 if (VerifyOops) {
1275 for (int i = 0; i < method->size_of_parameters(); i++) {
1276 if (is_reference_type(sig_bt[i])) {
1277 VMReg r = regs[i].first();
1278 assert(r->is_valid(), "bad oop arg");
1279 if (r->is_stack()) {
1280 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1281 __ verify_oop(temp_reg);
1282 } else {
1283 __ verify_oop(r->as_Register());
1284 }
1285 }
1286 }
1287 }
1288 }
1289
1290 static void check_continuation_enter_argument(VMReg actual_vmreg,
1291 Register expected_reg,
1292 const char* name) {
1293 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1294 assert(actual_vmreg->as_Register() == expected_reg,
1295 "%s is in unexpected register: %s instead of %s",
1296 name, actual_vmreg->as_Register()->name(), expected_reg->name());
1297 }
1298
1299
1300 //---------------------------- continuation_enter_setup ---------------------------
1301 //
1302 // Arguments:
1303 // None.
1304 //
1305 // Results:
1306 // rsp: pointer to blank ContinuationEntry
1307 //
1308 // Kills:
1309 // rax
1310 //
1311 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1312 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1313 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
1314 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1315
1316 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1317 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1318
1319 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1320 OopMap* map = new OopMap(frame_size, 0);
1321
1322 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1323 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1324 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1325
1326 return map;
1327 }
1328
1329 //---------------------------- fill_continuation_entry ---------------------------
1330 //
1331 // Arguments:
1332 // rsp: pointer to blank Continuation entry
1333 // reg_cont_obj: pointer to the continuation
1334 // reg_flags: flags
1335 //
1336 // Results:
1337 // rsp: pointer to filled out ContinuationEntry
1338 //
1339 // Kills:
1340 // rax
1341 //
1342 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1343 assert_different_registers(rax, reg_cont_obj, reg_flags);
1344 #ifdef ASSERT
1345 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1346 #endif
1347 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1348 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1349 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1350 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1351 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1352
1353 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1354 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1355
1356 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1357 }
1358
1359 //---------------------------- continuation_enter_cleanup ---------------------------
1360 //
1361 // Arguments:
1362 // rsp: pointer to the ContinuationEntry
1363 //
1364 // Results:
1365 // rsp: pointer to the spilled rbp in the entry frame
1366 //
1367 // Kills:
1368 // rbx
1369 //
1370 static void continuation_enter_cleanup(MacroAssembler* masm) {
1371 #ifdef ASSERT
1372 Label L_good_sp;
1373 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1374 __ jcc(Assembler::equal, L_good_sp);
1375 __ stop("Incorrect rsp at continuation_enter_cleanup");
1376 __ bind(L_good_sp);
1377 #endif
1378 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1379 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1380 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1381 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1382 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1383 }
1384
1385 static void gen_continuation_enter(MacroAssembler* masm,
1386 const VMRegPair* regs,
1387 int& exception_offset,
1388 OopMapSet* oop_maps,
1389 int& frame_complete,
1390 int& stack_slots,
1391 int& interpreted_entry_offset,
1392 int& compiled_entry_offset) {
1393
1394 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1395 int pos_cont_obj = 0;
1396 int pos_is_cont = 1;
1397 int pos_is_virtual = 2;
1398
1399 // The platform-specific calling convention may present the arguments in various registers.
1400 // To simplify the rest of the code, we expect the arguments to reside at these known
1401 // registers, and we additionally check the placement here in case calling convention ever
1402 // changes.
1403 Register reg_cont_obj = c_rarg1;
1404 Register reg_is_cont = c_rarg2;
1405 Register reg_is_virtual = c_rarg3;
1406
1407 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1408 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1409 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1410
1411 // Utility methods kill rax, make sure there are no collisions
1412 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1413
1414 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1415 relocInfo::static_call_type);
1416
1417 address start = __ pc();
1418
1419 Label L_thaw, L_exit;
1420
1421 // i2i entry used at interp_only_mode only
1422 interpreted_entry_offset = __ pc() - start;
1423 {
1424 #ifdef ASSERT
1425 Label is_interp_only;
1426 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1427 __ jcc(Assembler::notEqual, is_interp_only);
1428 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1429 __ bind(is_interp_only);
1430 #endif
1431
1432 __ pop(rax); // return address
1433 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1434 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1435 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1436 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1437 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1438 __ push(rax); // return address
1439 __ push_cont_fastpath();
1440
1441 __ enter();
1442
1443 stack_slots = 2; // will be adjusted in setup
1444 OopMap* map = continuation_enter_setup(masm, stack_slots);
1445     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1446     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1447
1448 __ verify_oop(reg_cont_obj);
1449
1450 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1451
1452 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1453 __ testptr(reg_is_cont, reg_is_cont);
1454 __ jcc(Assembler::notZero, L_thaw);
1455
1456 // --- Resolve path
1457
1458 // Make sure the call is patchable
1459 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1460 // Emit stub for static call
1461 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1462 if (stub == nullptr) {
1463 fatal("CodeCache is full at gen_continuation_enter");
1464 }
1465 __ call(resolve);
1466 oop_maps->add_gc_map(__ pc() - start, map);
1467 __ post_call_nop();
1468
1469 __ jmp(L_exit);
1470 }
1471
1472 // compiled entry
1473 __ align(CodeEntryAlignment);
1474 compiled_entry_offset = __ pc() - start;
1475 __ enter();
1476
1477 stack_slots = 2; // will be adjusted in setup
1478 OopMap* map = continuation_enter_setup(masm, stack_slots);
1479
1480 // Frame is now completed as far as size and linkage.
1481 frame_complete = __ pc() - start;
1482
1483 __ verify_oop(reg_cont_obj);
1484
1485 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1486
1487 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1488 __ testptr(reg_is_cont, reg_is_cont);
1489 __ jccb(Assembler::notZero, L_thaw);
1490
1491 // --- call Continuation.enter(Continuation c, boolean isContinue)
1492
1493 // Make sure the call is patchable
1494 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1495
1496 // Emit stub for static call
1497 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1498 if (stub == nullptr) {
1499 fatal("CodeCache is full at gen_continuation_enter");
1500 }
1501
1502 // The call needs to be resolved. There's a special case for this in
1503 // SharedRuntime::find_callee_info_helper() which calls
1504 // LinkResolver::resolve_continuation_enter() which resolves the call to
1505 // Continuation.enter(Continuation c, boolean isContinue).
1506 __ call(resolve);
1507
1508 oop_maps->add_gc_map(__ pc() - start, map);
1509 __ post_call_nop();
1510
1511 __ jmpb(L_exit);
1512
1513 // --- Thawing path
1514
1515 __ bind(L_thaw);
1516
1517 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1518 __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1519
1520 ContinuationEntry::_return_pc_offset = __ pc() - start;
1521 oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1522 __ post_call_nop();
1523
1524 // --- Normal exit (resolve/thawing)
1525
1526 __ bind(L_exit);
1527 ContinuationEntry::_cleanup_offset = __ pc() - start;
1528 continuation_enter_cleanup(masm);
1529 __ pop(rbp);
1530 __ ret(0);
1531
1532 // --- Exception handling path
1533
1534 exception_offset = __ pc() - start;
1535
1536 continuation_enter_cleanup(masm);
1537 __ pop(rbp);
1538
1539 __ movptr(c_rarg0, r15_thread);
1540 __ movptr(c_rarg1, Address(rsp, 0)); // return address
1541
1542 // rax still holds the original exception oop, save it before the call
1543 __ push(rax);
1544
1545 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1546 __ movptr(rbx, rax);
1547
1548 // Continue at exception handler:
1549 // rax: exception oop
1550 // rbx: exception handler
1551 // rdx: exception pc
1552 __ pop(rax);
1553 __ verify_oop(rax);
1554 __ pop(rdx);
1555 __ jmp(rbx);
1556 }
1557
1558 static void gen_continuation_yield(MacroAssembler* masm,
1559 const VMRegPair* regs,
1560 OopMapSet* oop_maps,
1561 int& frame_complete,
1562 int& stack_slots,
1563 int& compiled_entry_offset) {
1564 enum layout {
1565 rbp_off,
1566 rbpH_off,
1567 return_off,
1568 return_off2,
1569 framesize // inclusive of return address
1570 };
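// The enum above describes a minimal frame of two words (saved rbp plus the return
// address), i.e. four 4-byte VM slots, which is why the assert just below expects
// stack_slots == 2.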
1571 stack_slots = framesize / VMRegImpl::slots_per_word;
1572 assert(stack_slots == 2, "recheck layout");
1573
1574 address start = __ pc();
1575 compiled_entry_offset = __ pc() - start;
1576 __ enter();
1577 address the_pc = __ pc();
1578
1579 frame_complete = the_pc - start;
1580
1581 // This nop must be exactly at the PC we push into the frame info.
1582 // We use this nop for fast CodeBlob lookup, associate the OopMap
1583 // with it right away.
1584 __ post_call_nop();
1585 OopMap* map = new OopMap(framesize, 1);
1586 oop_maps->add_gc_map(frame_complete, map);
1587
1588 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1589 __ movptr(c_rarg0, r15_thread);
1590 __ movptr(c_rarg1, rsp);
1591 __ call_VM_leaf(Continuation::freeze_entry(), 2);
1592 __ reset_last_Java_frame(true);
1593
1594 Label L_pinned;
1595
1596 __ testptr(rax, rax);
1597 __ jcc(Assembler::notZero, L_pinned);
1598
1599 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1600 continuation_enter_cleanup(masm);
1601 __ pop(rbp);
1602 __ ret(0);
1603
1604 __ bind(L_pinned);
1605
1606 // Pinned, return to caller
1607
1608 // handle pending exception thrown by freeze
1609 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1610 Label ok;
1611 __ jcc(Assembler::equal, ok);
1612 __ leave();
1613 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1614 __ bind(ok);
1615
1616 __ leave();
1617 __ ret(0);
1618 }
1619
1620 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1621 ::continuation_enter_cleanup(masm);
1622 }
1623
1624 static void gen_special_dispatch(MacroAssembler* masm,
1625 const methodHandle& method,
1626 const BasicType* sig_bt,
1627 const VMRegPair* regs) {
1628 verify_oop_args(masm, method, sig_bt, regs);
1629 vmIntrinsics::ID iid = method->intrinsic_id();
1630
1631 // Now write the args into the outgoing interpreter space
1632 bool has_receiver = false;
1633 Register receiver_reg = noreg;
1634 int member_arg_pos = -1;
1635 Register member_reg = noreg;
1636 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1637 if (ref_kind != 0) {
1638 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1639 member_reg = rbx; // known to be free at this point
1640 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1641 } else if (iid == vmIntrinsics::_invokeBasic) {
1642 has_receiver = true;
1643 } else if (iid == vmIntrinsics::_linkToNative) {
1644 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument
1645 member_reg = rbx; // known to be free at this point
1646 } else {
1647 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1648 }
1649
1650 if (member_reg != noreg) {
1651 // Load the member_arg into register, if necessary.
1652 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1653 VMReg r = regs[member_arg_pos].first();
1654 if (r->is_stack()) {
1655 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1656 } else {
1657 // no data motion is needed
1658 member_reg = r->as_Register();
1659 }
1660 }
1661
1662 if (has_receiver) {
1663 // Make sure the receiver is loaded into a register.
1664 assert(method->size_of_parameters() > 0, "oob");
1665 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1666 VMReg r = regs[0].first();
1667 assert(r->is_valid(), "bad receiver arg");
1668 if (r->is_stack()) {
1669 // Porting note: This assumes that compiled calling conventions always
1670 // pass the receiver oop in a register. If this is not true on some
1671 // platform, pick a temp and load the receiver from stack.
1672 fatal("receiver always in a register");
1673 receiver_reg = j_rarg0; // known to be free at this point
1674 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1675 } else {
1676 // no data motion is needed
1677 receiver_reg = r->as_Register();
1678 }
1679 }
1680
1681 // Figure out which address we are really jumping to:
1682 MethodHandles::generate_method_handle_dispatch(masm, iid,
1683 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1684 }
1685
1686 // ---------------------------------------------------------------------------
1687 // Generate a native wrapper for a given method. The method takes arguments
1688 // in the Java compiled code convention, marshals them to the native
1689 // convention (handlizes oops, etc), transitions to native, makes the call,
1690 // returns to java state (possibly blocking), unhandlizes any result and
1691 // returns.
1692 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1701 //
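// A rough sketch of the code emitted below for an ordinary (non-intrinsic) native method,
// in roughly the order it appears (not exhaustive; locking, DTrace and JFR pieces are
// conditional):
//
//   ic_check / verified entry point
//   stack-overflow bang, enter frame, nmethod entry barrier
//   shuffle Java args into the C convention, handlize oops (and the class mirror if static)
//   set_last_Java_frame, lock if the method is synchronized
//   thread state: _thread_in_Java -> _thread_in_native, call the native function
//   thread state: _thread_in_native -> _thread_in_native_trans, safepoint/suspend check
//   thread state: -> _thread_in_Java, unlock if needed, unhandlize the result
//   reset_last_Java_frame, pop the frame, pending-exception check, return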
1702 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1703 const methodHandle& method,
1704 int compile_id,
1705 BasicType* in_sig_bt,
1706 VMRegPair* in_regs,
1707 BasicType ret_type) {
1708 if (method->is_continuation_native_intrinsic()) {
1709 int exception_offset = -1;
1710 OopMapSet* oop_maps = new OopMapSet();
1711 int frame_complete = -1;
1712 int stack_slots = -1;
1713 int interpreted_entry_offset = -1;
1714 int vep_offset = -1;
1715 if (method->is_continuation_enter_intrinsic()) {
1716 gen_continuation_enter(masm,
1717 in_regs,
1718 exception_offset,
1719 oop_maps,
1720 frame_complete,
1721 stack_slots,
1722 interpreted_entry_offset,
1723 vep_offset);
1724 } else if (method->is_continuation_yield_intrinsic()) {
1725 gen_continuation_yield(masm,
1726 in_regs,
1727 oop_maps,
1728 frame_complete,
1729 stack_slots,
1730 vep_offset);
1731 } else {
1732 guarantee(false, "Unknown Continuation native intrinsic");
1733 }
1734
1735 #ifdef ASSERT
1736 if (method->is_continuation_enter_intrinsic()) {
1737 assert(interpreted_entry_offset != -1, "Must be set");
1738 assert(exception_offset != -1, "Must be set");
1739 } else {
1740 assert(interpreted_entry_offset == -1, "Must be unset");
1741 assert(exception_offset == -1, "Must be unset");
1742 }
1743 assert(frame_complete != -1, "Must be set");
1744 assert(stack_slots != -1, "Must be set");
1745 assert(vep_offset != -1, "Must be set");
1746 #endif
1747
1748 __ flush();
1749 nmethod* nm = nmethod::new_native_nmethod(method,
1750 compile_id,
1751 masm->code(),
1752 vep_offset,
1753 frame_complete,
1754 stack_slots,
1755 in_ByteSize(-1),
1756 in_ByteSize(-1),
1757 oop_maps,
1758 exception_offset);
1759 if (nm == nullptr) return nm;
1760 if (method->is_continuation_enter_intrinsic()) {
1761 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1762 } else if (method->is_continuation_yield_intrinsic()) {
1763 _cont_doYield_stub = nm;
1764 }
1765 return nm;
1766 }
1767
1768 if (method->is_method_handle_intrinsic()) {
1769 vmIntrinsics::ID iid = method->intrinsic_id();
1770 intptr_t start = (intptr_t)__ pc();
1771 int vep_offset = ((intptr_t)__ pc()) - start;
1772 gen_special_dispatch(masm,
1773 method,
1774 in_sig_bt,
1775 in_regs);
1776 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1777 __ flush();
1778 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1779 return nmethod::new_native_nmethod(method,
1780 compile_id,
1781 masm->code(),
1782 vep_offset,
1783 frame_complete,
1784 stack_slots / VMRegImpl::slots_per_word,
1785 in_ByteSize(-1),
1786 in_ByteSize(-1),
1787 nullptr);
1788 }
1789 address native_func = method->native_function();
1790 assert(native_func != nullptr, "must have function");
1791
1792 // An OopMap for lock (and class if static)
1793 OopMapSet *oop_maps = new OopMapSet();
1794 intptr_t start = (intptr_t)__ pc();
1795
// We have received a description of where all the java args are located
1797 // on entry to the wrapper. We need to convert these args to where
1798 // the jni function will expect them. To figure out where they go
1799 // we convert the java signature to a C signature by inserting
1800 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1801
1802 const int total_in_args = method->size_of_parameters();
1803 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1804
1805 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1806 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1807
1808 int argc = 0;
1809 out_sig_bt[argc++] = T_ADDRESS;
1810 if (method->is_static()) {
1811 out_sig_bt[argc++] = T_OBJECT;
1812 }
1813
1814 for (int i = 0; i < total_in_args ; i++ ) {
1815 out_sig_bt[argc++] = in_sig_bt[i];
1816 }
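// For illustration (hypothetical method): a "static native int foo(Object o)" with
// in_sig_bt = { T_OBJECT } gives total_c_args == 3 and
// out_sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_OBJECT /* o */ }.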
1817
1818 // Now figure out where the args must be stored and how much stack space
1819 // they require.
1820 int out_arg_slots;
1821 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1822
1823 // Compute framesize for the wrapper. We need to handlize all oops in
1824 // incoming registers
1825
1826 // Calculate the total number of stack slots we will need.
1827
1828 // First count the abi requirement plus all of the outgoing args
1829 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1830
1831 // Now the space for the inbound oop handle area
1832 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1833
1834 int oop_handle_offset = stack_slots;
1835 stack_slots += total_save_slots;
1836
1837 // Now any space we need for handlizing a klass if static method
1838
1839 int klass_slot_offset = 0;
1840 int klass_offset = -1;
1841 int lock_slot_offset = 0;
1842 bool is_static = false;
1843
1844 if (method->is_static()) {
1845 klass_slot_offset = stack_slots;
1846 stack_slots += VMRegImpl::slots_per_word;
1847 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1848 is_static = true;
1849 }
1850
1851 // Plus a lock if needed
1852
1853 if (method->is_synchronized()) {
1854 lock_slot_offset = stack_slots;
1855 stack_slots += VMRegImpl::slots_per_word;
1856 }
1857
1858 // Now a place (+2) to save return values or temp during shuffling
1859 // + 4 for return address (which we own) and saved rbp
1860 stack_slots += 6;
1861
// OK, the space we have allocated will look like:
1863 //
1864 //
1865 // FP-> | |
1866 // |---------------------|
1867 // | 2 slots for moves |
1868 // |---------------------|
1869 // | lock box (if sync) |
1870 // |---------------------| <- lock_slot_offset
1871 // | klass (if static) |
1872 // |---------------------| <- klass_slot_offset
1873 // | oopHandle area |
1874 // |---------------------| <- oop_handle_offset (6 java arg registers)
1875 // | outbound memory |
1876 // | based arguments |
1877 // | |
1878 // |---------------------|
1879 // | |
1880 // SP-> | out_preserved_slots |
1881 //
1882 //
1883
1884
1885 // Now compute actual number of stack words we need rounding to make
1886 // stack properly aligned.
1887 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1888
1889 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
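// Worked example with hypothetical numbers: if stack_slots is 38 before alignment, the
// 16-byte stack alignment (StackAlignmentInSlots == 4 with 4-byte slots) rounds it up to
// 40 slots, giving stack_size = 40 * VMRegImpl::stack_slot_size = 160 bytes.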
1890
1891 // First thing make an ic check to see if we should even be here
1892
1893 // We are free to use all registers as temps without saving them and
1894 // restoring them except rbp. rbp is the only callee save register
1895 // as far as the interpreter and the compiler(s) are concerned.
1896
1897 const Register receiver = j_rarg0;
1898
1899 Label exception_pending;
1900
1901 assert_different_registers(receiver, rscratch1, rscratch2);
1902 __ verify_oop(receiver);
1903 __ ic_check(8 /* end_alignment */);
1904
1905 int vep_offset = ((intptr_t)__ pc()) - start;
1906
1907 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1908 Label L_skip_barrier;
1909 Register klass = r10;
1910 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1911 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1912
1913 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1914
1915 __ bind(L_skip_barrier);
1916 }
1917
1918 #ifdef COMPILER1
1919 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1920 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1921 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1922 }
1923 #endif // COMPILER1
1924
1925 // The instruction at the verified entry point must be 5 bytes or longer
1926 // because it can be patched on the fly by make_non_entrant. The stack bang
1927 // instruction fits that requirement.
1928
1929 // Generate stack overflow check
1930 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1931
1932 // Generate a new frame for the wrapper.
1933 __ enter();
1934 // -2 because return address is already present and so is saved rbp
1935 __ subptr(rsp, stack_size - 2*wordSize);
1936
1937 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1938 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1939 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1940
1941 // Frame is now completed as far as size and linkage.
1942 int frame_complete = ((intptr_t)__ pc()) - start;
1943
1944 #ifdef ASSERT
1945 __ check_stack_alignment(rsp, "improperly aligned stack");
1946 #endif /* ASSERT */
1947
1948
1949 // We use r14 as the oop handle for the receiver/klass
1950 // It is callee save so it survives the call to native
1951
1952 const Register oop_handle_reg = r14;
1953
1954 //
1955 // We immediately shuffle the arguments so that any vm call we have to
1956 // make from here on out (sync slow path, jvmti, etc.) we will have
1957 // captured the oops from our caller and have a valid oopMap for
1958 // them.
1959
1960 // -----------------
1961 // The Grand Shuffle
1962
// The Java calling convention is either equal (linux) or denser (win64) than the
// c calling convention. However, because of the jni_env argument, the c calling
// convention always has at least one more argument (two more for static methods) than Java.
1966 // Therefore if we move the args from java -> c backwards then we will never have
1967 // a register->register conflict and we don't have to build a dependency graph
1968 // and figure out how to break any cycles.
1969 //
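// A sketch of the reasoning: with JNIEnv* (and the class mirror for static methods)
// prepended, Java arg i becomes C arg i+1 (or i+2), so every destination lies "later" in
// the C convention than its source does in the Java convention. Walking the args from
// last to first therefore never overwrites a value that is still waiting to be moved.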
1970
1971 // Record esp-based slot for receiver on stack for non-static methods
1972 int receiver_offset = -1;
1973
1974 // This is a trick. We double the stack slots so we can claim
1975 // the oops in the caller's frame. Since we are sure to have
// more args than the caller, doubling is enough to make
1977 // sure we can capture all the incoming oop args from the
1978 // caller.
1979 //
1980 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1981
1982 // Mark location of rbp (someday)
1983 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1984
1985 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1986 // All inbound args are referenced based on rbp and all outbound args via rsp.
1987
1988
1989 #ifdef ASSERT
1990 bool reg_destroyed[Register::number_of_registers];
1991 bool freg_destroyed[XMMRegister::number_of_registers];
1992 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1993 reg_destroyed[r] = false;
1994 }
1995 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1996 freg_destroyed[f] = false;
1997 }
1998
1999 #endif /* ASSERT */
2000
2001 // For JNI natives the incoming and outgoing registers are offset upwards.
2002 GrowableArray<int> arg_order(2 * total_in_args);
2003
2004 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2005 arg_order.push(i);
2006 arg_order.push(c_arg);
2007 }
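// For a hypothetical non-static method with two parameters (total_in_args == 2,
// total_c_args == 3) the loop above produces arg_order == { 1, 2, 0, 1 }, i.e. the pairs
// (java arg 1 -> c arg 2) and (java arg 0 -> c arg 1), processed in that order.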
2008
2009 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2010 int i = arg_order.at(ai);
2011 int c_arg = arg_order.at(ai + 1);
2012 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2013 #ifdef ASSERT
2014 if (in_regs[i].first()->is_Register()) {
2015 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2016 } else if (in_regs[i].first()->is_XMMRegister()) {
2017 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2018 }
2019 if (out_regs[c_arg].first()->is_Register()) {
2020 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2021 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2022 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2023 }
2024 #endif /* ASSERT */
2025 switch (in_sig_bt[i]) {
2026 case T_ARRAY:
2027 case T_OBJECT:
2028 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2029 ((i == 0) && (!is_static)),
2030 &receiver_offset);
2031 break;
2032 case T_VOID:
2033 break;
2034
2035 case T_FLOAT:
2036 __ float_move(in_regs[i], out_regs[c_arg]);
2037 break;
2038
2039 case T_DOUBLE:
2040 assert( i + 1 < total_in_args &&
2041 in_sig_bt[i + 1] == T_VOID &&
2042 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2043 __ double_move(in_regs[i], out_regs[c_arg]);
2044 break;
2045
2046 case T_LONG :
2047 __ long_move(in_regs[i], out_regs[c_arg]);
2048 break;
2049
2050 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2051
2052 default:
2053 __ move32_64(in_regs[i], out_regs[c_arg]);
2054 }
2055 }
2056
2057 int c_arg;
2058
2059 // Pre-load a static method's oop into r14. Used both by locking code and
2060 // the normal JNI call code.
2061 // point c_arg at the first arg that is already loaded in case we
2062 // need to spill before we call out
2063 c_arg = total_c_args - total_in_args;
2064
2065 if (method->is_static()) {
2066
2067 // load oop into a register
2068 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2069
// Now handlize the static class mirror; it's known to be not-null.
2071 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2072 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2073
2074 // Now get the handle
2075 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2076 // store the klass handle as second argument
2077 __ movptr(c_rarg1, oop_handle_reg);
2078 // and protect the arg if we must spill
2079 c_arg--;
2080 }
2081
2082 // Change state to native (we save the return address in the thread, since it might not
2083 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2084 // points into the right code segment. It does not have to be the correct return pc.
2085 // We use the same pc/oopMap repeatedly when we call out
2086
2087 Label native_return;
2088 if (method->is_object_wait0()) {
2089 // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2090 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2091 } else {
2092 intptr_t the_pc = (intptr_t) __ pc();
2093 oop_maps->add_gc_map(the_pc - start, map);
2094
2095 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2096 }
2097
// We have all of the arguments set up at this point. We must not touch any argument
// registers from here on (if we were to save/restore them around a call, there would be
// no oopMap describing any oops they might hold).
2100
2101 if (DTraceMethodProbes) {
2102 // protect the args we've loaded
2103 save_args(masm, total_c_args, c_arg, out_regs);
2104 __ mov_metadata(c_rarg1, method());
2105 __ call_VM_leaf(
2106 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2107 r15_thread, c_rarg1);
2108 restore_args(masm, total_c_args, c_arg, out_regs);
2109 }
2110
2111 // RedefineClasses() tracing support for obsolete method entry
2112 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2113 // protect the args we've loaded
2114 save_args(masm, total_c_args, c_arg, out_regs);
2115 __ mov_metadata(c_rarg1, method());
2116 __ call_VM_leaf(
2117 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2118 r15_thread, c_rarg1);
2119 restore_args(masm, total_c_args, c_arg, out_regs);
2120 }
2121
2122 // Lock a synchronized method
2123
2124 // Register definitions used by locking and unlocking
2125
2126 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2127 const Register obj_reg = rbx; // Will contain the oop
2128 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2129
2130 Label slow_path_lock;
2131 Label lock_done;
2132
2133 if (method->is_synchronized()) {
2134 // Get the handle (the 2nd argument)
2135 __ mov(oop_handle_reg, c_rarg1);
2136
2137 // Get address of the box
2138
2139 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2140
2141 // Load the oop from the handle
2142 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2143
2144 __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2145
2146 // Slow path will re-enter here
2147 __ bind(lock_done);
2148 }
2149
2150 // Finally just about ready to make the JNI call
2151
2152 // get JNIEnv* which is first argument to native
2153 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2154
2155 // Now set thread in native
2156 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2157
2158 __ call(RuntimeAddress(native_func));
2159
2160 // Verify or restore cpu control state after JNI call
2161 __ restore_cpu_control_state_after_jni(rscratch1);
2162
2163 // Unpack native results.
2164 switch (ret_type) {
2165 case T_BOOLEAN: __ c2bool(rax); break;
2166 case T_CHAR : __ movzwl(rax, rax); break;
2167 case T_BYTE : __ sign_extend_byte (rax); break;
2168 case T_SHORT : __ sign_extend_short(rax); break;
2169 case T_INT : /* nothing to do */ break;
2170 case T_DOUBLE :
2171 case T_FLOAT :
// Result is in xmm0; we'll save it as needed
2173 break;
2174 case T_ARRAY: // Really a handle
2175 case T_OBJECT: // Really a handle
2176 break; // can't de-handlize until after safepoint check
2177 case T_VOID: break;
2178 case T_LONG: break;
2179 default : ShouldNotReachHere();
2180 }
2181
2182 // Switch thread to "native transition" state before reading the synchronization state.
2183 // This additional state is necessary because reading and testing the synchronization
2184 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2185 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2186 // VM thread changes sync state to synchronizing and suspends threads for GC.
2187 // Thread A is resumed to finish this native method, but doesn't block here since it
// didn't see any synchronization in progress, and escapes.
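// A condensed sketch of the transition sequence around the native call:
//   _thread_in_native       (set before the call above)
//   _thread_in_native_trans (set below, followed by a full fence unless UseSystemMemoryBarrier)
//   safepoint/suspend poll  (slow path calls check_special_condition_for_native_trans)
//   _thread_in_Java         (set once it is safe to return to Java code)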
2189 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2190
2191 // Force this write out before the read below
2192 if (!UseSystemMemoryBarrier) {
2193 __ membar(Assembler::Membar_mask_bits(
2194 Assembler::LoadLoad | Assembler::LoadStore |
2195 Assembler::StoreLoad | Assembler::StoreStore));
2196 }
2197
2198 // check for safepoint operation in progress and/or pending suspend requests
2199 {
2200 Label Continue;
2201 Label slow_path;
2202
2203 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2204
2205 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2206 __ jcc(Assembler::equal, Continue);
2207 __ bind(slow_path);
2208
2209 // Don't use call_VM as it will see a possible pending exception and forward it
2210 // and never return here preventing us from clearing _last_native_pc down below.
2211 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2212 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2213 // by hand.
2214 //
2215 __ vzeroupper();
2216 save_native_result(masm, ret_type, stack_slots);
2217 __ mov(c_rarg0, r15_thread);
2218 __ mov(r12, rsp); // remember sp
2219 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2220 __ andptr(rsp, -16); // align stack as required by ABI
2221 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2222 __ mov(rsp, r12); // restore sp
2223 __ reinit_heapbase();
2224 // Restore any method result value
2225 restore_native_result(masm, ret_type, stack_slots);
2226 __ bind(Continue);
2227 }
2228
2229 // change thread state
2230 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2231
2232 if (method->is_object_wait0()) {
2233 // Check preemption for Object.wait()
2234 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2235 __ cmpptr(rscratch1, NULL_WORD);
2236 __ jccb(Assembler::equal, native_return);
2237 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2238 __ jmp(rscratch1);
2239 __ bind(native_return);
2240
2241 intptr_t the_pc = (intptr_t) __ pc();
2242 oop_maps->add_gc_map(the_pc - start, map);
2243 }
2244
2245
2246 Label reguard;
2247 Label reguard_done;
2248 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2249 __ jcc(Assembler::equal, reguard);
2250 __ bind(reguard_done);
2251
2252 // native result if any is live
2253
2254 // Unlock
2255 Label slow_path_unlock;
2256 Label unlock_done;
2257 if (method->is_synchronized()) {
2258
2259 Label fast_done;
2260
2261 // Get locked oop from the handle we passed to jni
2262 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2263
2264 // Must save rax if it is live now because cmpxchg must use it
2265 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2266 save_native_result(masm, ret_type, stack_slots);
2267 }
2268
2269 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2270
2271 // slow path re-enters here
2272 __ bind(unlock_done);
2273 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2274 restore_native_result(masm, ret_type, stack_slots);
2275 }
2276
2277 __ bind(fast_done);
2278 }
2279 if (DTraceMethodProbes) {
2280 save_native_result(masm, ret_type, stack_slots);
2281 __ mov_metadata(c_rarg1, method());
2282 __ call_VM_leaf(
2283 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2284 r15_thread, c_rarg1);
2285 restore_native_result(masm, ret_type, stack_slots);
2286 }
2287
2288 __ reset_last_Java_frame(false);
2289
2290 // Unbox oop result, e.g. JNIHandles::resolve value.
2291 if (is_reference_type(ret_type)) {
2292 __ resolve_jobject(rax /* value */,
2293 rcx /* tmp */);
2294 }
2295
2296 if (CheckJNICalls) {
2297 // clear_pending_jni_exception_check
2298 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2299 }
2300
2301 // reset handle block
2302 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2303 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2304
2305 // pop our frame
2306
2307 __ leave();
2308
2309 #if INCLUDE_JFR
2310 // We need to do a poll test after unwind in case the sampler
2311 // managed to sample the native frame after returning to Java.
2312 Label L_return;
2313 address poll_test_pc = __ pc();
2314 __ relocate(relocInfo::poll_return_type);
2315 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2316 __ jccb(Assembler::zero, L_return);
2317 __ lea(rscratch1, InternalAddress(poll_test_pc));
2318 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2319 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2320 "polling page return stub not created yet");
2321 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2322 __ jump(RuntimeAddress(stub));
2323 __ bind(L_return);
2324 #endif // INCLUDE_JFR
2325
2326 // Any exception pending?
2327 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2328 __ jcc(Assembler::notEqual, exception_pending);
2329
2330 // Return
2331
2332 __ ret(0);
2333
2334 // Unexpected paths are out of line and go here
2335
2336 // forward the exception
2337 __ bind(exception_pending);
2338
2339 // and forward the exception
2340 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2341
2342 // Slow path locking & unlocking
2343 if (method->is_synchronized()) {
2344
2345 // BEGIN Slow path lock
2346 __ bind(slow_path_lock);
2347
2348 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2349 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2350
2351 // protect the args we've loaded
2352 save_args(masm, total_c_args, c_arg, out_regs);
2353
2354 __ mov(c_rarg0, obj_reg);
2355 __ mov(c_rarg1, lock_reg);
2356 __ mov(c_rarg2, r15_thread);
2357
2358 // Not a leaf but we have last_Java_frame setup as we want.
2359 // We don't want to unmount in case of contention since that would complicate preserving
2360 // the arguments that had already been marshalled into the native convention. So we force
2361 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2362 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2363 __ push_cont_fastpath();
2364 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2365 __ pop_cont_fastpath();
2366 restore_args(masm, total_c_args, c_arg, out_regs);
2367
2368 #ifdef ASSERT
2369 { Label L;
2370 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2371 __ jcc(Assembler::equal, L);
2372 __ stop("no pending exception allowed on exit from monitorenter");
2373 __ bind(L);
2374 }
2375 #endif
2376 __ jmp(lock_done);
2377
2378 // END Slow path lock
2379
2380 // BEGIN Slow path unlock
2381 __ bind(slow_path_unlock);
2382
2383 // If we haven't already saved the native result we must save it now as xmm registers
2384 // are still exposed.
2385 __ vzeroupper();
2386 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2387 save_native_result(masm, ret_type, stack_slots);
2388 }
2389
2390 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2391
2392 __ mov(c_rarg0, obj_reg);
2393 __ mov(c_rarg2, r15_thread);
2394 __ mov(r12, rsp); // remember sp
2395 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2396 __ andptr(rsp, -16); // align stack as required by ABI
2397
2398 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2399 // NOTE that obj_reg == rbx currently
2400 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2401 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2402
2403 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2404 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2405 __ mov(rsp, r12); // restore sp
2406 __ reinit_heapbase();
2407 #ifdef ASSERT
2408 {
2409 Label L;
2410 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2411 __ jcc(Assembler::equal, L);
2412 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2413 __ bind(L);
2414 }
2415 #endif /* ASSERT */
2416
2417 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2418
2419 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2420 restore_native_result(masm, ret_type, stack_slots);
2421 }
2422 __ jmp(unlock_done);
2423
2424 // END Slow path unlock
2425
2426 } // synchronized
2427
2428 // SLOW PATH Reguard the stack if needed
2429
2430 __ bind(reguard);
2431 __ vzeroupper();
2432 save_native_result(masm, ret_type, stack_slots);
2433 __ mov(r12, rsp); // remember sp
2434 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2435 __ andptr(rsp, -16); // align stack as required by ABI
2436 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2437 __ mov(rsp, r12); // restore sp
2438 __ reinit_heapbase();
2439 restore_native_result(masm, ret_type, stack_slots);
2440 // and continue
2441 __ jmp(reguard_done);
2442
2443
2444
2445 __ flush();
2446
2447 nmethod *nm = nmethod::new_native_nmethod(method,
2448 compile_id,
2449 masm->code(),
2450 vep_offset,
2451 frame_complete,
2452 stack_slots / VMRegImpl::slots_per_word,
2453 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2454 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2455 oop_maps);
2456
2457 return nm;
2458 }
2459
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization
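// For example (hypothetical numbers): callee_parameters == 2 and callee_locals == 5 yields
// an adjustment of (5 - 2) * Interpreter::stackElementWords words.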
2462 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2463 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2464 }
2465
2466
2467 uint SharedRuntime::out_preserve_stack_slots() {
2468 return 0;
2469 }
2470
2471
2472 // Number of stack slots between incoming argument block and the start of
2473 // a new frame. The PROLOG must add this many slots to the stack. The
2474 // EPILOG must remove this many slots. amd64 needs two slots for
2475 // return address.
2476 uint SharedRuntime::in_preserve_stack_slots() {
2477 return 4 + 2 * VerifyStackAtCalls;
2478 }
2479
2480 VMReg SharedRuntime::thread_register() {
2481 return r15_thread->as_VMReg();
2482 }
2483
2484 //------------------------------generate_deopt_blob----------------------------
2485 void SharedRuntime::generate_deopt_blob() {
2486 // Allocate space for the code
2487 ResourceMark rm;
2488 // Setup code generation tools
2489 int pad = 0;
2490 if (UseAVX > 2) {
2491 pad += 1024;
2492 }
2493 if (UseAPX) {
2494 pad += 1024;
2495 }
2496 #if INCLUDE_JVMCI
2497 if (EnableJVMCI) {
2498 pad += 512; // Increase the buffer size when compiling for JVMCI
2499 }
2500 #endif
2501 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2502 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2503 if (blob != nullptr) {
2504 _deopt_blob = blob->as_deoptimization_blob();
2505 return;
2506 }
2507
2508 CodeBuffer buffer(name, 2560+pad, 1024);
2509 MacroAssembler* masm = new MacroAssembler(&buffer);
2510 int frame_size_in_words;
2511 OopMap* map = nullptr;
2512 OopMapSet *oop_maps = new OopMapSet();
2513
2514 // -------------
2515 // This code enters when returning to a de-optimized nmethod. A return
2516 // address has been pushed on the stack, and return values are in
2517 // registers.
2518 // If we are doing a normal deopt then we were called from the patched
2519 // nmethod from the point we returned to the nmethod. So the return
2520 // address on the stack is wrong by NativeCall::instruction_size
2521 // We will adjust the value so it looks like we have the original return
2522 // address on the stack (like when we eagerly deoptimized).
2523 // In the case of an exception pending when deoptimizing, we enter
2524 // with a return address on the stack that points after the call we patched
2525 // into the exception handler. We have the following register state from,
2526 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2527 // rax: exception oop
2528 // rbx: exception handler
2529 // rdx: throwing pc
2530 // So in this case we simply jam rdx into the useless return address and
2531 // the stack looks just like we want.
2532 //
2533 // At this point we need to de-opt. We save the argument return
2534 // registers. We call the first C routine, fetch_unroll_info(). This
2535 // routine captures the return values and returns a structure which
2536 // describes the current frame size and the sizes of all replacement frames.
2537 // The current frame is compiled code and may contain many inlined
2538 // functions, each with their own JVM state. We pop the current frame, then
2539 // push all the new frames. Then we call the C routine unpack_frames() to
2540 // populate these frames. Finally unpack_frames() returns us the new target
2541 // address. Notice that callee-save registers are BLOWN here; they have
2542 // already been captured in the vframeArray at the time the return PC was
2543 // patched.
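// A condensed sketch of the frame transformation performed below (hypothetical case of a
// compiled frame holding two inlined Java frames):
//
//   before:  [caller] [deoptimizing compiled frame]
//   after:   [caller] [skeletal interpreter frame 1] [skeletal interpreter frame 2] [self-frame]
//
// fetch_unroll_info() supplies the sizes and pcs used to push the skeletal interpreter
// frames, and unpack_frames() then fills them in before we return into the interpreter.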
2544 address start = __ pc();
2545 Label cont;
2546
2547 // Prolog for non exception case!
2548
2549 // Save everything in sight.
2550 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2551
2552 // Normal deoptimization. Save exec mode for unpack_frames.
2553 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2554 __ jmp(cont);
2555
2556 int reexecute_offset = __ pc() - start;
2557 #if INCLUDE_JVMCI && !defined(COMPILER1)
2558 if (UseJVMCICompiler) {
2559 // JVMCI does not use this kind of deoptimization
2560 __ should_not_reach_here();
2561 }
2562 #endif
2563
2564 // Reexecute case
// return address is the pc that describes which bci to re-execute at
2566
2567 // No need to update map as each call to save_live_registers will produce identical oopmap
2568 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2569
2570 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2571 __ jmp(cont);
2572
2573 #if INCLUDE_JVMCI
2574 Label after_fetch_unroll_info_call;
2575 int implicit_exception_uncommon_trap_offset = 0;
2576 int uncommon_trap_offset = 0;
2577
2578 if (EnableJVMCI) {
2579 implicit_exception_uncommon_trap_offset = __ pc() - start;
2580
2581 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2582 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2583
2584 uncommon_trap_offset = __ pc() - start;
2585
2586 // Save everything in sight.
2587 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2588 // fetch_unroll_info needs to call last_java_frame()
2589 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2590
2591 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2592 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2593
2594 __ movl(r14, Deoptimization::Unpack_reexecute);
2595 __ mov(c_rarg0, r15_thread);
2596 __ movl(c_rarg2, r14); // exec mode
2597 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2598 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2599
2600 __ reset_last_Java_frame(false);
2601
2602 __ jmp(after_fetch_unroll_info_call);
2603 } // EnableJVMCI
2604 #endif // INCLUDE_JVMCI
2605
2606 int exception_offset = __ pc() - start;
2607
2608 // Prolog for exception case
2609
2610 // all registers are dead at this entry point, except for rax, and
2611 // rdx which contain the exception oop and exception pc
2612 // respectively. Set them in TLS and fall thru to the
2613 // unpack_with_exception_in_tls entry point.
2614
2615 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2616 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2617
2618 int exception_in_tls_offset = __ pc() - start;
2619
2620 // new implementation because exception oop is now passed in JavaThread
2621
2622 // Prolog for exception case
2623 // All registers must be preserved because they might be used by LinearScan
// Exception oop and throwing PC are passed in JavaThread
2625 // tos: stack at point of call to method that threw the exception (i.e. only
2626 // args are on the stack, no return address)
2627
2628 // make room on stack for the return address
2629 // It will be patched later with the throwing pc. The correct value is not
2630 // available now because loading it from memory would destroy registers.
2631 __ push(0);
2632
2633 // Save everything in sight.
2634 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2635
2636 // Now it is safe to overwrite any register
2637
2638 // Deopt during an exception. Save exec mode for unpack_frames.
2639 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2640
2641 // load throwing pc from JavaThread and patch it as the return address
2642 // of the current frame. Then clear the field in JavaThread
2643
2644 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2645 __ movptr(Address(rbp, wordSize), rdx);
2646 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2647
2648 #ifdef ASSERT
2649 // verify that there is really an exception oop in JavaThread
2650 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2651 __ verify_oop(rax);
2652
2653 // verify that there is no pending exception
2654 Label no_pending_exception;
2655 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2656 __ testptr(rax, rax);
2657 __ jcc(Assembler::zero, no_pending_exception);
2658 __ stop("must not have pending exception here");
2659 __ bind(no_pending_exception);
2660 #endif
2661
2662 __ bind(cont);
2663
2664 // Call C code. Need thread and this frame, but NOT official VM entry
2665 // crud. We cannot block on this call, no GC can happen.
2666 //
2667 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2668
2669 // fetch_unroll_info needs to call last_java_frame().
2670
2671 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2672 #ifdef ASSERT
2673 { Label L;
2674 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2675 __ jcc(Assembler::equal, L);
2676 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2677 __ bind(L);
2678 }
2679 #endif // ASSERT
2680 __ mov(c_rarg0, r15_thread);
2681 __ movl(c_rarg1, r14); // exec_mode
2682 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2683
2684 // Need to have an oopmap that tells fetch_unroll_info where to
2685 // find any register it might need.
2686 oop_maps->add_gc_map(__ pc() - start, map);
2687
2688 __ reset_last_Java_frame(false);
2689
2690 #if INCLUDE_JVMCI
2691 if (EnableJVMCI) {
2692 __ bind(after_fetch_unroll_info_call);
2693 }
2694 #endif
2695
2696 // Load UnrollBlock* into rdi
2697 __ mov(rdi, rax);
2698
2699 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2700 Label noException;
2701 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2702 __ jcc(Assembler::notEqual, noException);
2703 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2704 // QQQ this is useless it was null above
2705 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2706 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2707 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2708
2709 __ verify_oop(rax);
2710
2711 // Overwrite the result registers with the exception results.
2712 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2713 // I think this is useless
2714 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2715
2716 __ bind(noException);
2717
2718 // Only register save data is on the stack.
2719 // Now restore the result registers. Everything else is either dead
2720 // or captured in the vframeArray.
2721 RegisterSaver::restore_result_registers(masm);
2722
// All of the register save area has been popped off the stack. Only the
2724 // return address remains.
2725
2726 // Pop all the frames we must move/replace.
2727 //
2728 // Frame picture (youngest to oldest)
2729 // 1: self-frame (no frame link)
2730 // 2: deopting frame (no frame link)
2731 // 3: caller of deopting frame (could be compiled/interpreted).
2732 //
2733 // Note: by leaving the return address of self-frame on the stack
2734 // and using the size of frame 2 to adjust the stack
2735 // when we are done the return to frame 3 will still be on the stack.
2736
2737 // Pop deoptimized frame
2738 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2739 __ addptr(rsp, rcx);
2740
2741 // rsp should be pointing at the return address to the caller (3)
2742
2743 // Pick up the initial fp we should save
2744 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2745 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2746
2747 #ifdef ASSERT
// Compilers generate code that bangs the stack by as much as the
2749 // interpreter would need. So this stack banging should never
2750 // trigger a fault. Verify that it does not on non product builds.
2751 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2752 __ bang_stack_size(rbx, rcx);
2753 #endif
2754
2755 // Load address of array of frame pcs into rcx
2756 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2757
2758 // Trash the old pc
2759 __ addptr(rsp, wordSize);
2760
2761 // Load address of array of frame sizes into rsi
2762 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2763
2764 // Load counter into rdx
2765 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2766
2767 // Now adjust the caller's stack to make up for the extra locals
2768 // but record the original sp so that we can save it in the skeletal interpreter
2769 // frame and the stack walking of interpreter_sender will get the unextended sp
2770 // value and not the "real" sp value.
2771
2772 const Register sender_sp = r8;
2773
2774 __ mov(sender_sp, rsp);
2775 __ movl(rbx, Address(rdi,
2776 Deoptimization::UnrollBlock::
2777 caller_adjustment_offset()));
2778 __ subptr(rsp, rbx);
2779
2780 // Push interpreter frames in a loop
2781 Label loop;
2782 __ bind(loop);
2783 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2784 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2785 __ pushptr(Address(rcx, 0)); // Save return address
2786 __ enter(); // Save old & set new ebp
2787 __ subptr(rsp, rbx); // Prolog
2788 // This value is corrected by layout_activation_impl
2789 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2790 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2791 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2792 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2793 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2794 __ decrementl(rdx); // Decrement counter
2795 __ jcc(Assembler::notZero, loop);
2796 __ pushptr(Address(rcx, 0)); // Save final return address
2797
2798 // Re-push self-frame
2799 __ enter(); // Save old & set new ebp
2800
2801 // Allocate a full sized register save area.
2802 // Return address and rbp are in place, so we allocate two less words.
2803 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2804
2805 // Restore frame locals after moving the frame
2806 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2807 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2808
2809 // Call C code. Need thread but NOT official VM entry
2810 // crud. We cannot block on this call, no GC can happen. Call should
2811 // restore return values to their stack-slots with the new SP.
2812 //
2813 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2814
2815 // Use rbp because the frames look interpreted now
2816 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2817 // Don't need the precise return PC here, just precise enough to point into this code blob.
2818 address the_pc = __ pc();
2819 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2820
2821 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2822 __ mov(c_rarg0, r15_thread);
2823 __ movl(c_rarg1, r14); // second arg: exec_mode
2824 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2825 // Revert SP alignment after call since we're going to do some SP relative addressing below
2826 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2827
2828 // Set an oopmap for the call site
2829 // Use the same PC we used for the last java frame
2830 oop_maps->add_gc_map(the_pc - start,
2831 new OopMap( frame_size_in_words, 0 ));
2832
2833 // Clear fp AND pc
2834 __ reset_last_Java_frame(true);
2835
2836 // Collect return values
2837 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2838 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2839 // I think this is useless (throwing pc?)
2840 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2841
2842 // Pop self-frame.
2843 __ leave(); // Epilog
2844
2845 // Jump to interpreter
2846 __ ret(0);
2847
2848 // Make sure all code is generated
2849 masm->flush();
2850
2851 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2852 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2853 #if INCLUDE_JVMCI
2854 if (EnableJVMCI) {
2855 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2856 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2857 }
2858 #endif
2859
2860 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2861 }
2862
2863 //------------------------------generate_handler_blob------
2864 //
2865 // Generate a special Compile2Runtime blob that saves all registers,
2866 // and setup oopmap.
2867 //
2868 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2869 assert(StubRoutines::forward_exception_entry() != nullptr,
2870 "must be generated before");
2871 assert(is_polling_page_id(id), "expected a polling page stub id");
2872
2873 // Allocate space for the code. Setup code generation tools.
2874 const char* name = SharedRuntime::stub_name(id);
2875 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2876 if (blob != nullptr) {
2877 return blob->as_safepoint_blob();
2878 }
2879
2880 ResourceMark rm;
2881 OopMapSet *oop_maps = new OopMapSet();
2882 OopMap* map;
2883 CodeBuffer buffer(name, 2548, 1024);
2884 MacroAssembler* masm = new MacroAssembler(&buffer);
2885
2886 address start = __ pc();
2887 address call_pc = nullptr;
2888 int frame_size_in_words;
2889 bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2890 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2891
2892 // Make room for return address (or push it again)
2893 if (!cause_return) {
2894 __ push(rbx);
2895 }
2896
2897 // Save registers, fpu state, and flags
2898 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2899
2900 // The following is basically a call_VM. However, we need the precise
2901 // address of the call in order to generate an oopmap. Hence, we do all the
2902 // work ourselves.
2903
2904 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2905
// The return address must always be correct so that the frame constructor never
// sees an invalid pc.
2908
2909 if (!cause_return) {
2910 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2911 // Additionally, rbx is a callee saved register and we can look at it later to determine
2912 // if someone changed the return address for us!
2913 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2914 __ movptr(Address(rbp, wordSize), rbx);
2915 }
2916
2917 // Do the call
2918 __ mov(c_rarg0, r15_thread);
2919 __ call(RuntimeAddress(call_ptr));
2920
2921 // Set an oopmap for the call site. This oopmap will map all
2922 // oop-registers and debug-info registers as callee-saved. This
2923 // will allow deoptimization at this safepoint to find all possible
2924 // debug-info recordings, as well as let GC find all oops.
2925
2926 oop_maps->add_gc_map( __ pc() - start, map);
2927
2928 Label noException;
2929
2930 __ reset_last_Java_frame(false);
2931
2932 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2933 __ jcc(Assembler::equal, noException);
2934
2935 // Exception pending
2936
2937 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2938
2939 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2940
2941 // No exception case
2942 __ bind(noException);
2943
2944 Label no_adjust;
2945 #ifdef ASSERT
2946 Label bail;
2947 #endif
2948 if (!cause_return) {
2949 Label no_prefix, not_special, check_rex_prefix;
2950
2951 // If our stashed return pc was modified by the runtime we avoid touching it
2952 __ cmpptr(rbx, Address(rbp, wordSize));
2953 __ jcc(Assembler::notEqual, no_adjust);
2954
2955 // Skip over the poll instruction.
2956 // See NativeInstruction::is_safepoint_poll()
2957 // Possible encodings:
2958 // 85 00 test %eax,(%rax)
2959 // 85 01 test %eax,(%rcx)
2960 // 85 02 test %eax,(%rdx)
2961 // 85 03 test %eax,(%rbx)
2962 // 85 06 test %eax,(%rsi)
2963 // 85 07 test %eax,(%rdi)
2964 //
2965 // 41 85 00 test %eax,(%r8)
2966 // 41 85 01 test %eax,(%r9)
2967 // 41 85 02 test %eax,(%r10)
2968 // 41 85 03 test %eax,(%r11)
2969 // 41 85 06 test %eax,(%r14)
2970 // 41 85 07 test %eax,(%r15)
2971 //
2972 // 85 04 24 test %eax,(%rsp)
2973 // 41 85 04 24 test %eax,(%r12)
2974 // 85 45 00 test %eax,0x0(%rbp)
2975 // 41 85 45 00 test %eax,0x0(%r13)
2976 //
    // Notes:
    //   Format of the legacy MAP0 test instruction:
    //     [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //   o For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
    //     first register operand and of the base register of the memory operand both lie in
    //     [0-8), so no additional REX prefix (whose REX.B bit would carry the MSB of the
    //     register encoding) is required, and a two-byte encoding is sufficient.
    //   o For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the
    //     BASE register of the memory operand is 1000, so an additional REX prefix is needed,
    //     adding one byte to the instruction encoding.
    //   o If the BASE register is one of the 32 extended GPRs available only on targets
    //     supporting the Intel APX extension, a two-byte REX2 prefix is needed to hold the
    //     most significant two bits of the 5-bit register encoding.
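    //
    // Putting this together, the code below advances the stashed return pc past the poll:
    // the final 2-byte step covers the opcode + ModRM, with 2 extra bytes added for a REX2
    // prefix (APX targets), 1 for a REX.B prefix, and 1 for the SIB/disp8 byte required when
    // the base register is rsp/rbp/r12/r13.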
2990
2991 if (VM_Version::supports_apx_f()) {
2992 __ cmpb(Address(rbx, 0), Assembler::REX2);
2993 __ jccb(Assembler::notEqual, check_rex_prefix);
2994 __ addptr(rbx, 2);
2995 __ bind(check_rex_prefix);
2996 }
2997 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2998 __ jccb(Assembler::notEqual, no_prefix);
2999 __ addptr(rbx, 1);
3000 __ bind(no_prefix);
3001 #ifdef ASSERT
3002 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3003 #endif
3004 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3005 // r12/rsp 0x04
3006 // r13/rbp 0x05
3007 __ movzbq(rcx, Address(rbx, 1));
3008 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3009 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3010 __ cmpptr(rcx, 1);
3011 __ jccb(Assembler::above, not_special);
3012 __ addptr(rbx, 1);
3013 __ bind(not_special);
3014 #ifdef ASSERT
3015 // Verify the correct encoding of the poll we're about to skip.
3016 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3017 __ jcc(Assembler::notEqual, bail);
3018 // Mask out the modrm bits
3019 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3020 // rax encodes to 0, so if the bits are nonzero it's incorrect
3021 __ jcc(Assembler::notZero, bail);
3022 #endif
3023 // Adjust return pc forward to step over the safepoint poll instruction
3024 __ addptr(rbx, 2);
3025 __ movptr(Address(rbp, wordSize), rbx);
3026 }
3027
3028 __ bind(no_adjust);
3029 // Normal exit, restore registers and exit.
3030 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3031 __ ret(0);
3032
3033 #ifdef ASSERT
3034 __ bind(bail);
3035 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3036 #endif
3037
3038 // Make sure all code is generated
3039 masm->flush();
3040
3041 // Fill-out other meta info
3042 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3043
3044 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3045 return sp_blob;
3046 }
3047
3048 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3050 //
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are, and the caller
// must do any GC of the args.
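// Because of this, the blob is created with caller_must_gc_arguments == true (the final
// argument passed to RuntimeStub::new_runtime_stub below).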
3055 //
3056 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3058 assert(is_resolve_id(id), "expected a resolve stub id");
3059
3060 const char* name = SharedRuntime::stub_name(id);
3061 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3062 if (blob != nullptr) {
3063 return blob->as_runtime_stub();
3064 }
3065
3066 // allocate space for the code
3067 ResourceMark rm;
3068 CodeBuffer buffer(name, 1552, 512);
3069 MacroAssembler* masm = new MacroAssembler(&buffer);
3070
3071 int frame_size_in_words;
3072
3073 OopMapSet *oop_maps = new OopMapSet();
3074 OopMap* map = nullptr;
3075
3076 int start = __ offset();
3077
3078 // No need to save vector registers since they are caller-saved anyway.
3079 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3080
3081 int frame_complete = __ offset();
3082
3083 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3084
3085 __ mov(c_rarg0, r15_thread);
3086
3087 __ call(RuntimeAddress(destination));
3088
3089
3090 // Set an oopmap for the call site.
3091 // We need this not only for callee-saved registers, but also for volatile
3092 // registers that the compiler might be keeping live across a safepoint.
3093
3094 oop_maps->add_gc_map( __ offset() - start, map);
3095
3096 // rax contains the address we are going to jump to assuming no exception got installed
3097
3098 // clear last_Java_sp
3099 __ reset_last_Java_frame(false);
3100 // check for pending exceptions
3101 Label pending;
3102 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3103 __ jcc(Assembler::notEqual, pending);
3104
3105 // get the returned Method*
3106 __ get_vm_result_metadata(rbx);
3107 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3108
3109 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3110
3111 RegisterSaver::restore_live_registers(masm);
3112
3113 // We are back to the original state on entry and ready to go.
3114
3115 __ jmp(rax);
3116
3117 // Pending exception after the safepoint
3118
3119 __ bind(pending);
3120
3121 RegisterSaver::restore_live_registers(masm);
3122
3123 // exception pending => remove activation and forward to exception handler
3124
3125 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3126
3127 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3128 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3129
3130 // -------------
3131 // make sure all code is generated
3132 masm->flush();
3133
  // Return the blob. The frame size is passed in words, as
  // RuntimeStub::new_runtime_stub expects.
3136 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3137
3138 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3139 return rs_blob;
3140 }
3141
3142 // Continuation point for throwing of implicit exceptions that are
3143 // not handled in the current activation. Fabricates an exception
3144 // oop and initiates normal exception dispatching in this
3145 // frame. Since we need to preserve callee-saved values (currently
3146 // only for C2, but done for C1 as well) we need a callee-saved oop
3147 // map and therefore have to make these stubs into RuntimeStubs
3148 // rather than BufferBlobs. If the compiler needs all registers to
3149 // be preserved between the fault point and the exception handler
3150 // then it must assume responsibility for that in
3151 // AbstractCompiler::continuation_for_implicit_null_exception or
3152 // continuation_for_implicit_division_by_zero_exception. All other
3153 // implicit exceptions (e.g., NullPointerException or
3154 // AbstractMethodError on entry) are either at call sites or
3155 // otherwise assume that stack unwinding will be initiated, so
// caller-saved registers were assumed to be volatile by the compiler.
3157 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3158 assert(is_throw_id(id), "expected a throw stub id");
3159
3160 const char* name = SharedRuntime::stub_name(id);
3161
3162 // Information about frame layout at time of blocking runtime call.
3163 // Note that we only have to preserve callee-saved registers since
3164 // the compilers are responsible for supplying a continuation point
3165 // if they expect all registers to be preserved.
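  // Offsets in the layout enum below are in 32-bit (jint) slots: rbp and the return address
  // each occupy two slots, and on Windows (where frame::arg_reg_save_area_bytes is non-zero)
  // they sit above the register-argument shadow area.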
3166 enum layout {
3167 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3168 rbp_off2,
3169 return_off,
3170 return_off2,
3171 framesize // inclusive of return address
3172 };
3173
3174 int insts_size = 512;
3175 int locs_size = 64;
3176
3177 const char* timer_msg = "SharedRuntime generate_throw_exception";
3178 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3179
3180 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3181 if (blob != nullptr) {
3182 return blob->as_runtime_stub();
3183 }
3184
3185 ResourceMark rm;
3186 CodeBuffer code(name, insts_size, locs_size);
3187 OopMapSet* oop_maps = new OopMapSet();
3188 MacroAssembler* masm = new MacroAssembler(&code);
3189
3190 address start = __ pc();
3191
  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently from the real call_VM.
3196
3197 __ enter(); // required for proper stackwalking of RuntimeStub frame
3198
3199 assert(is_even(framesize/2), "sp not 16-byte aligned");
3200
3201 // return address and rbp are already in place
3202 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3203
3204 int frame_complete = __ pc() - start;
3205
3206 // Set up last_Java_sp and last_Java_fp
3207 address the_pc = __ pc();
3208 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3209 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3210
3211 // Call runtime
3212 __ movptr(c_rarg0, r15_thread);
3213 BLOCK_COMMENT("call runtime_entry");
3214 __ call(RuntimeAddress(runtime_entry));
3215
3216 // Generate oop map
3217 OopMap* map = new OopMap(framesize, 0);
3218
3219 oop_maps->add_gc_map(the_pc - start, map);
3220
3221 __ reset_last_Java_frame(true);
3222
3223 __ leave(); // required for proper stackwalking of RuntimeStub frame
3224
3225 // check for pending exceptions
3226 #ifdef ASSERT
3227 Label L;
3228 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3229 __ jcc(Assembler::notEqual, L);
3230 __ should_not_reach_here();
3231 __ bind(L);
3232 #endif // ASSERT
3233 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3234
3235
3236 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3237 RuntimeStub* stub =
3238 RuntimeStub::new_runtime_stub(name,
3239 &code,
3240 frame_complete,
3241 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3242 oop_maps, false);
3243 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3244
3245 return stub;
3246 }
3247
3248 //------------------------------Montgomery multiplication------------------------
3249 //
3250
3251 #ifndef _WINDOWS
3252
3253 // Subtract 0:b from carry:a. Return carry.
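// That is, treat (carry, a[len-1..0]) as a (len+1)-word value, subtract the len-word value b
// from it, write the low len words back into a, and return the resulting top word.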
3254 static julong
3255 sub(julong a[], julong b[], julong carry, long len) {
3256 long long i = 0, cnt = len;
3257 julong tmp;
3258 asm volatile("clc; "
3259 "0: ; "
3260 "mov (%[b], %[i], 8), %[tmp]; "
3261 "sbb %[tmp], (%[a], %[i], 8); "
3262 "inc %[i]; dec %[cnt]; "
3263 "jne 0b; "
3264 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3265 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3266 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3267 : "memory");
3268 return tmp;
3269 }
3270
3271 // Multiply (unsigned) Long A by Long B, accumulating the double-
3272 // length result into the accumulator formed of T0, T1, and T2.
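// In other words, the three-word accumulator is updated as (T2:T1:T0) += (julong)A * (julong)B.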
3273 #define MACC(A, B, T0, T1, T2) \
3274 do { \
3275 unsigned long hi, lo; \
3276 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3277 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3278 : "r"(A), "a"(B) : "cc"); \
3279 } while(0)
3280
3281 // As above, but add twice the double-length result into the
3282 // accumulator.
3283 #define MACC2(A, B, T0, T1, T2) \
3284 do { \
3285 unsigned long hi, lo; \
3286 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3287 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3288 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3289 : "r"(A), "a"(B) : "cc"); \
3290 } while(0)
3291
3292 #else //_WINDOWS
3293
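// Subtract 0:b from carry:a and return the carry (Windows variant of the inline-asm version
// above, built on the _addcarry_u64 intrinsic: a - b is computed as a + ~b + 1).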
3294 static julong
3295 sub(julong a[], julong b[], julong carry, long len) {
3296 long i;
3297 julong tmp;
3298 unsigned char c = 1;
3299 for (i = 0; i < len; i++) {
3300 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3301 a[i] = tmp;
3302 }
3303 c = _addcarry_u64(c, carry, ~0, &tmp);
3304 return tmp;
3305 }
3306
3307 // Multiply (unsigned) Long A by Long B, accumulating the double-
3308 // length result into the accumulator formed of T0, T1, and T2.
3309 #define MACC(A, B, T0, T1, T2) \
3310 do { \
3311 julong hi, lo; \
3312 lo = _umul128(A, B, &hi); \
3313 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3314 c = _addcarry_u64(c, hi, T1, &T1); \
3315 _addcarry_u64(c, T2, 0, &T2); \
3316 } while(0)
3317
3318 // As above, but add twice the double-length result into the
3319 // accumulator.
3320 #define MACC2(A, B, T0, T1, T2) \
3321 do { \
3322 julong hi, lo; \
3323 lo = _umul128(A, B, &hi); \
3324 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3325 c = _addcarry_u64(c, hi, T1, &T1); \
3326 _addcarry_u64(c, T2, 0, &T2); \
3327 c = _addcarry_u64(0, lo, T0, &T0); \
3328 c = _addcarry_u64(c, hi, T1, &T1); \
3329 _addcarry_u64(c, T2, 0, &T2); \
3330 } while(0)
3331
3332 #endif //_WINDOWS
3333
3334 // Fast Montgomery multiplication. The derivation of the algorithm is
3335 // in A Cryptographic Library for the Motorola DSP56000,
3336 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
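// The routine below computes m = a * b * R^-1 (mod n), where R = 2^(64*len) and inv satisfies
// inv * n[0] == -1 (mod 2^64) (see the assert). Each iteration folds one word of the product
// and one word of the Montgomery reduction into a three-word accumulator (t2:t1:t0); the
// trailing loop subtracts n while a carry remains, bringing the result back into range.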
3337
3338 static void NOINLINE
3339 montgomery_multiply(julong a[], julong b[], julong n[],
3340 julong m[], julong inv, int len) {
3341 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3342 int i;
3343
3344 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3345
3346 for (i = 0; i < len; i++) {
3347 int j;
3348 for (j = 0; j < i; j++) {
3349 MACC(a[j], b[i-j], t0, t1, t2);
3350 MACC(m[j], n[i-j], t0, t1, t2);
3351 }
3352 MACC(a[i], b[0], t0, t1, t2);
3353 m[i] = t0 * inv;
3354 MACC(m[i], n[0], t0, t1, t2);
3355
3356 assert(t0 == 0, "broken Montgomery multiply");
3357
3358 t0 = t1; t1 = t2; t2 = 0;
3359 }
3360
3361 for (i = len; i < 2*len; i++) {
3362 int j;
3363 for (j = i-len+1; j < len; j++) {
3364 MACC(a[j], b[i-j], t0, t1, t2);
3365 MACC(m[j], n[i-j], t0, t1, t2);
3366 }
3367 m[i-len] = t0;
3368 t0 = t1; t1 = t2; t2 = 0;
3369 }
3370
3371 while (t0)
3372 t0 = sub(m, n, t0, len);
3373 }
3374
3375 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3376 // multiplies so it should be up to 25% faster than Montgomery
3377 // multiplication. However, its loop control is more complex and it
3378 // may actually run slower on some machines.
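// The saving comes from the symmetric cross terms: a[j]*a[i-j] and a[i-j]*a[j] are computed
// once and doubled via MACC2, roughly halving the ~len^2 product multiplies, while the ~len^2
// reduction multiplies (m[j]*n[i-j]) are unchanged -- about 1.5*len^2 instead of 2*len^2.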
3379
3380 static void NOINLINE
3381 montgomery_square(julong a[], julong n[],
3382 julong m[], julong inv, int len) {
3383 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3384 int i;
3385
3386 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3387
3388 for (i = 0; i < len; i++) {
3389 int j;
3390 int end = (i+1)/2;
3391 for (j = 0; j < end; j++) {
3392 MACC2(a[j], a[i-j], t0, t1, t2);
3393 MACC(m[j], n[i-j], t0, t1, t2);
3394 }
3395 if ((i & 1) == 0) {
3396 MACC(a[j], a[j], t0, t1, t2);
3397 }
3398 for (; j < i; j++) {
3399 MACC(m[j], n[i-j], t0, t1, t2);
3400 }
3401 m[i] = t0 * inv;
3402 MACC(m[i], n[0], t0, t1, t2);
3403
3404 assert(t0 == 0, "broken Montgomery square");
3405
3406 t0 = t1; t1 = t2; t2 = 0;
3407 }
3408
3409 for (i = len; i < 2*len; i++) {
3410 int start = i-len+1;
3411 int end = start + (len - start)/2;
3412 int j;
3413 for (j = start; j < end; j++) {
3414 MACC2(a[j], a[i-j], t0, t1, t2);
3415 MACC(m[j], n[i-j], t0, t1, t2);
3416 }
3417 if ((i & 1) == 0) {
3418 MACC(a[j], a[j], t0, t1, t2);
3419 }
3420 for (; j < len; j++) {
3421 MACC(m[j], n[i-j], t0, t1, t2);
3422 }
3423 m[i-len] = t0;
3424 t0 = t1; t1 = t2; t2 = 0;
3425 }
3426
3427 while (t0)
3428 t0 = sub(m, n, t0, len);
3429 }
3430
3431 // Swap words in a longword.
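// For example, swap(0x1111111122222222ULL) == 0x2222222211111111ULL.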
3432 static julong swap(julong x) {
3433 return (x << 32) | (x >> 32);
3434 }
3435
3436 // Copy len longwords from s to d, word-swapping as we go. The
3437 // destination array is reversed.
3438 static void reverse_words(julong *s, julong *d, int len) {
3439 d += len;
3440 while(len-- > 0) {
3441 d--;
3442 *d = swap(*s);
3443 s++;
3444 }
3445 }
3446
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3449 #define MONTGOMERY_SQUARING_THRESHOLD 64
3450
3451 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3452 jint len, jlong inv,
3453 jint *m_ints) {
3454 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3455 int longwords = len/2;
3456
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
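  // (512 jints == 256 julongs; 256 * sizeof(julong) * 4 scratch arrays == 8192 bytes.)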
3460 int divisor = sizeof(julong) * 4;
3461 guarantee(longwords <= 8192 / divisor, "must be");
3462 int total_allocation = longwords * sizeof (julong) * 4;
3463 julong *scratch = (julong *)alloca(total_allocation);
3464
3465 // Local scratch arrays
3466 julong
3467 *a = scratch + 0 * longwords,
3468 *b = scratch + 1 * longwords,
3469 *n = scratch + 2 * longwords,
3470 *m = scratch + 3 * longwords;
3471
3472 reverse_words((julong *)a_ints, a, longwords);
3473 reverse_words((julong *)b_ints, b, longwords);
3474 reverse_words((julong *)n_ints, n, longwords);
3475
3476 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3477
3478 reverse_words(m, (julong *)m_ints, longwords);
3479 }
3480
3481 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3482 jint len, jlong inv,
3483 jint *m_ints) {
3484 assert(len % 2 == 0, "array length in montgomery_square must be even");
3485 int longwords = len/2;
3486
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
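  // (512 jints == 256 julongs; 256 * sizeof(julong) * 3 scratch arrays == 6144 bytes.)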
3490 int divisor = sizeof(julong) * 3;
3491 guarantee(longwords <= (8192 / divisor), "must be");
3492 int total_allocation = longwords * sizeof (julong) * 3;
3493 julong *scratch = (julong *)alloca(total_allocation);
3494
3495 // Local scratch arrays
3496 julong
3497 *a = scratch + 0 * longwords,
3498 *n = scratch + 1 * longwords,
3499 *m = scratch + 2 * longwords;
3500
3501 reverse_words((julong *)a_ints, a, longwords);
3502 reverse_words((julong *)n_ints, n, longwords);
3503
3504 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3505 ::montgomery_square(a, n, m, (julong)inv, longwords);
3506 } else {
3507 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3508 }
3509
3510 reverse_words(m, (julong *)m_ints, longwords);
3511 }
3512
3513 #if INCLUDE_JFR
3514
// For C2: c_rarg0 is junk; call into the runtime to write a checkpoint.
// The call returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
3518 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3519 enum layout {
3520 rbp_off,
3521 rbpH_off,
3522 return_off,
3523 return_off2,
3524 framesize // inclusive of return address
3525 };
3526
3527 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3528 CodeBuffer code(name, 1024, 64);
3529 MacroAssembler* masm = new MacroAssembler(&code);
3530 address start = __ pc();
3531
3532 __ enter();
3533 address the_pc = __ pc();
3534
3535 int frame_complete = the_pc - start;
3536
3537 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3538 __ movptr(c_rarg0, r15_thread);
3539 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3540 __ reset_last_Java_frame(true);
3541
  // rax holds the jobject handle result; unpack and process it through a barrier.
3543 __ resolve_global_jobject(rax, c_rarg0);
3544
3545 __ leave();
3546 __ ret(0);
3547
3548 OopMapSet* oop_maps = new OopMapSet();
3549 OopMap* map = new OopMap(framesize, 1);
3550 oop_maps->add_gc_map(frame_complete, map);
3551
3552 RuntimeStub* stub =
3553 RuntimeStub::new_runtime_stub(name,
3554 &code,
3555 frame_complete,
3556 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3557 oop_maps,
3558 false);
3559 return stub;
3560 }
3561
// For C2: call into the runtime to return a leased buffer.
3563 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3564 enum layout {
3565 rbp_off,
3566 rbpH_off,
3567 return_off,
3568 return_off2,
3569 framesize // inclusive of return address
3570 };
3571
3572 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3573 CodeBuffer code(name, 1024, 64);
3574 MacroAssembler* masm = new MacroAssembler(&code);
3575 address start = __ pc();
3576
3577 __ enter();
3578 address the_pc = __ pc();
3579
3580 int frame_complete = the_pc - start;
3581
3582 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3583 __ movptr(c_rarg0, r15_thread);
3584 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3585 __ reset_last_Java_frame(true);
3586
3587 __ leave();
3588 __ ret(0);
3589
3590 OopMapSet* oop_maps = new OopMapSet();
3591 OopMap* map = new OopMap(framesize, 1);
3592 oop_maps->add_gc_map(frame_complete, map);
3593
3594 RuntimeStub* stub =
3595 RuntimeStub::new_runtime_stub(name,
3596 &code,
3597 frame_complete,
3598 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3599 oop_maps,
3600 false);
3601 return stub;
3602 }
3603
3604 #endif // INCLUDE_JFR
3605