1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68
69 #define __ masm->
70
71 #ifdef PRODUCT
72 #define BLOCK_COMMENT(str) /* nothing */
73 #else
74 #define BLOCK_COMMENT(str) __ block_comment(str)
75 #endif // PRODUCT
76
77 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
78
79 class RegisterSaver {
80 // Capture info about frame layout. Layout offsets are in jint
81 // units because compiler frame slots are jints.
82 #define XSAVE_AREA_BEGIN 160
83 #define XSAVE_AREA_YMM_BEGIN 576
84 #define XSAVE_AREA_EGPRS 960
85 #define XSAVE_AREA_OPMASK_BEGIN 1088
86 #define XSAVE_AREA_ZMM_BEGIN 1152
87 #define XSAVE_AREA_UPPERBANK 1664
88 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
89 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
90 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
91 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
92 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
93 enum layout {
94 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
95 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
96 DEF_XMM_OFFS(0),
97 DEF_XMM_OFFS(1),
98 // 2..15 are implied in range usage
99 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
100 DEF_YMM_OFFS(0),
101 DEF_YMM_OFFS(1),
102 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
103 r16H_off,
104 r17_off, r17H_off,
105 r18_off, r18H_off,
106 r19_off, r19H_off,
107 r20_off, r20H_off,
108 r21_off, r21H_off,
109 r22_off, r22H_off,
110 r23_off, r23H_off,
111 r24_off, r24H_off,
112 r25_off, r25H_off,
113 r26_off, r26H_off,
114 r27_off, r27H_off,
115 r28_off, r28H_off,
116 r29_off, r29H_off,
117 r30_off, r30H_off,
118 r31_off, r31H_off,
119 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
120 DEF_OPMASK_OFFS(0),
121 DEF_OPMASK_OFFS(1),
122 // 2..7 are implied in range usage
123 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
124 DEF_ZMM_OFFS(0),
125 DEF_ZMM_OFFS(1),
126 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
127 DEF_ZMM_UPPER_OFFS(16),
128 DEF_ZMM_UPPER_OFFS(17),
129 // 18..31 are implied in range usage
130 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
131 fpu_stateH_end,
132 r15_off, r15H_off,
133 r14_off, r14H_off,
134 r13_off, r13H_off,
135 r12_off, r12H_off,
136 r11_off, r11H_off,
137 r10_off, r10H_off,
138 r9_off, r9H_off,
139 r8_off, r8H_off,
140 rdi_off, rdiH_off,
141 rsi_off, rsiH_off,
142 ignore_off, ignoreH_off, // extra copy of rbp
143 rsp_off, rspH_off,
144 rbx_off, rbxH_off,
145 rdx_off, rdxH_off,
146 rcx_off, rcxH_off,
147 rax_off, raxH_off,
148 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
149 align_off, alignH_off,
150 flags_off, flagsH_off,
151 // The frame sender code expects that rbp will be in the "natural" place and
152 // will override any oopMap setting for it. We must therefore force the layout
153 // so that it agrees with the frame sender code.
154 rbp_off, rbpH_off, // copy of rbp we will restore
155 return_off, returnH_off, // slot for return address
156 reg_save_size // size in compiler stack slots
157 };
158
159 public:
160 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
161 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
162
163 // Offsets into the register save area
164 // Used by deoptimization when it is managing result register
165 // values on its own
166
167 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
168 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
169 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
170 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; }
171 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
172 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
173
174 // During deoptimization only the result registers need to be restored,
175 // all the other values have already been extracted.
176 static void restore_result_registers(MacroAssembler* masm);
177 };
178
179 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
180 int off = 0;
181 int num_xmm_regs = XMMRegister::available_xmm_registers();
182 #ifdef COMPILER2
183 if (save_wide_vectors && UseAVX == 0) {
184 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
185 }
186 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
187 #else
188 save_wide_vectors = false; // vectors are generated only by C2
189 #endif // COMPILER2
190
191 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
192 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
193 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
194 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
195 // CodeBlob frame size is in words.
196 int frame_size_in_words = frame_size_in_bytes / wordSize;
197 *total_frame_words = frame_size_in_words;
198
199 // Save registers, fpu state, and flags.
200 // We assume caller has already pushed the return address onto the
201 // stack, so rsp is 8-byte aligned here.
202 // We push rpb twice in this sequence because we want the real rbp
203 // to be under the return like a normal enter.
204
205 __ enter(); // rsp becomes 16-byte aligned here
206 __ pushf();
207 // Make sure rsp stays 16-byte aligned
208 __ subq(rsp, 8);
209 // Push CPU state in multiple of 16 bytes
210 __ save_legacy_gprs();
211 __ push_FPU_state();
212
213
214 // push cpu state handles this on EVEX enabled targets
215 if (save_wide_vectors) {
216 // Save upper half of YMM registers(0..15)
217 int base_addr = XSAVE_AREA_YMM_BEGIN;
218 for (int n = 0; n < 16; n++) {
219 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
220 }
221 if (VM_Version::supports_evex()) {
222 // Save upper half of ZMM registers(0..15)
223 base_addr = XSAVE_AREA_ZMM_BEGIN;
224 for (int n = 0; n < 16; n++) {
225 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
226 }
227 // Save full ZMM registers(16..num_xmm_regs)
228 base_addr = XSAVE_AREA_UPPERBANK;
229 off = 0;
230 int vector_len = Assembler::AVX_512bit;
231 for (int n = 16; n < num_xmm_regs; n++) {
232 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
233 }
234 #ifdef COMPILER2
235 base_addr = XSAVE_AREA_OPMASK_BEGIN;
236 off = 0;
237 for(int n = 0; n < KRegister::number_of_registers; n++) {
238 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
239 }
240 #endif // COMPILER2
241 }
242 } else {
243 if (VM_Version::supports_evex()) {
244 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
245 int base_addr = XSAVE_AREA_UPPERBANK;
246 off = 0;
247 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
248 for (int n = 16; n < num_xmm_regs; n++) {
249 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
250 }
251 #ifdef COMPILER2
252 base_addr = XSAVE_AREA_OPMASK_BEGIN;
253 off = 0;
254 for(int n = 0; n < KRegister::number_of_registers; n++) {
255 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
256 }
257 #endif // COMPILER2
258 }
259 }
260
261 #ifdef COMPILER2
262 if (UseAPX) {
263 int base_addr = XSAVE_AREA_EGPRS;
264 off = 0;
265 for (int n = 16; n < Register::number_of_registers; n++) {
266 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
267 }
268 }
269 #endif // COMPILER2
270
271 __ vzeroupper();
272 if (frame::arg_reg_save_area_bytes != 0) {
273 // Allocate argument register save area
274 __ subptr(rsp, frame::arg_reg_save_area_bytes);
275 }
276
277 // Set an oopmap for the call site. This oopmap will map all
278 // oop-registers and debug-info registers as callee-saved. This
279 // will allow deoptimization at this safepoint to find all possible
280 // debug-info recordings, as well as let GC find all oops.
281
282 OopMapSet *oop_maps = new OopMapSet();
283 OopMap* map = new OopMap(frame_size_in_slots, 0);
284
285 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
286
287 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
288 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
289 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
290 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
291 // rbp location is known implicitly by the frame sender code, needs no oopmap
292 // and the location where rbp was saved by is ignored
293 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
294 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
295 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
296 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
297 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
298 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
299 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
300 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
301 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
302 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
303
304 if (UseAPX) {
305 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
306 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
307 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
308 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
309 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
310 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
311 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
312 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
313 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
314 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
315 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
316 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
317 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
318 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
319 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
320 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
321 }
322 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
323 // on EVEX enabled targets, we get it included in the xsave area
324 off = xmm0_off;
325 int delta = xmm1_off - off;
326 for (int n = 0; n < 16; n++) {
327 XMMRegister xmm_name = as_XMMRegister(n);
328 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
329 off += delta;
330 }
331 if (UseAVX > 2) {
332 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
333 off = zmm16_off;
334 delta = zmm17_off - off;
335 for (int n = 16; n < num_xmm_regs; n++) {
336 XMMRegister zmm_name = as_XMMRegister(n);
337 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
338 off += delta;
339 }
340 }
341
342 #ifdef COMPILER2
343 if (save_wide_vectors) {
344 // Save upper half of YMM registers(0..15)
345 off = ymm0_off;
346 delta = ymm1_off - ymm0_off;
347 for (int n = 0; n < 16; n++) {
348 XMMRegister ymm_name = as_XMMRegister(n);
349 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
350 off += delta;
351 }
352 if (VM_Version::supports_evex()) {
353 // Save upper half of ZMM registers(0..15)
354 off = zmm0_off;
355 delta = zmm1_off - zmm0_off;
356 for (int n = 0; n < 16; n++) {
357 XMMRegister zmm_name = as_XMMRegister(n);
358 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
359 off += delta;
360 }
361 }
362 }
363 #endif // COMPILER2
364
365 // %%% These should all be a waste but we'll keep things as they were for now
366 if (true) {
367 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
368 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
369 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
370 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
371 // rbp location is known implicitly by the frame sender code, needs no oopmap
372 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
373 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
374 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
375 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
376 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
377 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
378 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
379 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
380 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
381 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
382 if (UseAPX) {
383 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
384 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
385 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
386 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
387 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
388 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
389 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
390 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
391 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
392 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
393 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
394 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
395 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
396 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
397 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
398 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
399 }
400 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
401 // on EVEX enabled targets, we get it included in the xsave area
402 off = xmm0H_off;
403 delta = xmm1H_off - off;
404 for (int n = 0; n < 16; n++) {
405 XMMRegister xmm_name = as_XMMRegister(n);
406 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
407 off += delta;
408 }
409 if (UseAVX > 2) {
410 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
411 off = zmm16H_off;
412 delta = zmm17H_off - off;
413 for (int n = 16; n < num_xmm_regs; n++) {
414 XMMRegister zmm_name = as_XMMRegister(n);
415 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
416 off += delta;
417 }
418 }
419 }
420
421 return map;
422 }
423
424 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
425 int num_xmm_regs = XMMRegister::available_xmm_registers();
426 if (frame::arg_reg_save_area_bytes != 0) {
427 // Pop arg register save area
428 __ addptr(rsp, frame::arg_reg_save_area_bytes);
429 }
430
431 #ifdef COMPILER2
432 if (restore_wide_vectors) {
433 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
434 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
435 }
436 #else
437 assert(!restore_wide_vectors, "vectors are generated only by C2");
438 #endif // COMPILER2
439
440 __ vzeroupper();
441
442 // On EVEX enabled targets everything is handled in pop fpu state
443 if (restore_wide_vectors) {
444 // Restore upper half of YMM registers (0..15)
445 int base_addr = XSAVE_AREA_YMM_BEGIN;
446 for (int n = 0; n < 16; n++) {
447 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
448 }
449 if (VM_Version::supports_evex()) {
450 // Restore upper half of ZMM registers (0..15)
451 base_addr = XSAVE_AREA_ZMM_BEGIN;
452 for (int n = 0; n < 16; n++) {
453 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
454 }
455 // Restore full ZMM registers(16..num_xmm_regs)
456 base_addr = XSAVE_AREA_UPPERBANK;
457 int vector_len = Assembler::AVX_512bit;
458 int off = 0;
459 for (int n = 16; n < num_xmm_regs; n++) {
460 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
461 }
462 #ifdef COMPILER2
463 base_addr = XSAVE_AREA_OPMASK_BEGIN;
464 off = 0;
465 for (int n = 0; n < KRegister::number_of_registers; n++) {
466 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
467 }
468 #endif // COMPILER2
469 }
470 } else {
471 if (VM_Version::supports_evex()) {
472 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
473 int base_addr = XSAVE_AREA_UPPERBANK;
474 int off = 0;
475 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
476 for (int n = 16; n < num_xmm_regs; n++) {
477 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
478 }
479 #ifdef COMPILER2
480 base_addr = XSAVE_AREA_OPMASK_BEGIN;
481 off = 0;
482 for (int n = 0; n < KRegister::number_of_registers; n++) {
483 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
484 }
485 #endif // COMPILER2
486 }
487 }
488
489 #ifdef COMPILER2
490 if (UseAPX) {
491 int base_addr = XSAVE_AREA_EGPRS;
492 int off = 0;
493 for (int n = 16; n < Register::number_of_registers; n++) {
494 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
495 }
496 }
497 #endif // COMPILER2
498
499 // Recover CPU state
500 __ pop_FPU_state();
501 __ restore_legacy_gprs();
502 __ addq(rsp, 8);
503 __ popf();
504 // Get the rbp described implicitly by the calling convention (no oopMap)
505 __ pop(rbp);
506 }
507
508 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
509
510 // Just restore result register. Only used by deoptimization. By
511 // now any callee save register that needs to be restored to a c2
512 // caller of the deoptee has been extracted into the vframeArray
513 // and will be stuffed into the c2i adapter we create for later
514 // restoration so only result registers need to be restored here.
515
516 // Restore fp result register
517 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
518 // Restore integer result register
519 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
520 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
521
522 // Pop all of the register save are off the stack except the return address
523 __ addptr(rsp, return_offset_in_bytes());
524 }
525
526 // Is vector's size (in bytes) bigger than a size saved by default?
527 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
528 bool SharedRuntime::is_wide_vector(int size) {
529 return size > 16;
530 }
531
532 // ---------------------------------------------------------------------------
533 // Read the array of BasicTypes from a signature, and compute where the
534 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
535 // quantities. Values less than VMRegImpl::stack0 are registers, those above
536 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
537 // as framesizes are fixed.
538 // VMRegImpl::stack0 refers to the first slot 0(sp).
539 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.
540 // Register up to Register::number_of_registers are the 64-bit
541 // integer registers.
542
543 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
544 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
545 // units regardless of build. Of course for i486 there is no 64 bit build
546
547 // The Java calling convention is a "shifted" version of the C ABI.
548 // By skipping the first C ABI register we can call non-static jni methods
549 // with small numbers of arguments without having to shuffle the arguments
550 // at all. Since we control the java ABI we ought to at least get some
551 // advantage out of it.
552
553 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
554 VMRegPair *regs,
555 int total_args_passed) {
556
557 // Create the mapping between argument positions and
558 // registers.
559 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
560 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
561 };
562 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
563 j_farg0, j_farg1, j_farg2, j_farg3,
564 j_farg4, j_farg5, j_farg6, j_farg7
565 };
566
567
568 uint int_args = 0;
569 uint fp_args = 0;
570 uint stk_args = 0;
571
572 for (int i = 0; i < total_args_passed; i++) {
573 switch (sig_bt[i]) {
574 case T_BOOLEAN:
575 case T_CHAR:
576 case T_BYTE:
577 case T_SHORT:
578 case T_INT:
579 if (int_args < Argument::n_int_register_parameters_j) {
580 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
581 } else {
582 stk_args = align_up(stk_args, 2);
583 regs[i].set1(VMRegImpl::stack2reg(stk_args));
584 stk_args += 1;
585 }
586 break;
587 case T_VOID:
588 // halves of T_LONG or T_DOUBLE
589 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
590 regs[i].set_bad();
591 break;
592 case T_LONG:
593 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
594 // fall through
595 case T_OBJECT:
596 case T_ARRAY:
597 case T_ADDRESS:
598 if (int_args < Argument::n_int_register_parameters_j) {
599 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
600 } else {
601 stk_args = align_up(stk_args, 2);
602 regs[i].set2(VMRegImpl::stack2reg(stk_args));
603 stk_args += 2;
604 }
605 break;
606 case T_FLOAT:
607 if (fp_args < Argument::n_float_register_parameters_j) {
608 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
609 } else {
610 stk_args = align_up(stk_args, 2);
611 regs[i].set1(VMRegImpl::stack2reg(stk_args));
612 stk_args += 1;
613 }
614 break;
615 case T_DOUBLE:
616 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
617 if (fp_args < Argument::n_float_register_parameters_j) {
618 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
619 } else {
620 stk_args = align_up(stk_args, 2);
621 regs[i].set2(VMRegImpl::stack2reg(stk_args));
622 stk_args += 2;
623 }
624 break;
625 default:
626 ShouldNotReachHere();
627 break;
628 }
629 }
630
631 return stk_args;
632 }
633
634 // Patch the callers callsite with entry to compiled code if it exists.
635 static void patch_callers_callsite(MacroAssembler *masm) {
636 Label L;
637 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
638 __ jcc(Assembler::equal, L);
639
640 // Save the current stack pointer
641 __ mov(r13, rsp);
642 // Schedule the branch target address early.
643 // Call into the VM to patch the caller, then jump to compiled callee
644 // rax isn't live so capture return address while we easily can
645 __ movptr(rax, Address(rsp, 0));
646
647 // align stack so push_CPU_state doesn't fault
648 __ andptr(rsp, -(StackAlignmentInBytes));
649 __ push_CPU_state();
650 __ vzeroupper();
651 // VM needs caller's callsite
652 // VM needs target method
653 // This needs to be a long call since we will relocate this adapter to
654 // the codeBuffer and it may not reach
655
656 // Allocate argument register save area
657 if (frame::arg_reg_save_area_bytes != 0) {
658 __ subptr(rsp, frame::arg_reg_save_area_bytes);
659 }
660 __ mov(c_rarg0, rbx);
661 __ mov(c_rarg1, rax);
662 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
663
664 // De-allocate argument register save area
665 if (frame::arg_reg_save_area_bytes != 0) {
666 __ addptr(rsp, frame::arg_reg_save_area_bytes);
667 }
668
669 __ vzeroupper();
670 __ pop_CPU_state();
671 // restore sp
672 __ mov(rsp, r13);
673 __ bind(L);
674 }
675
676 static void gen_c2i_adapter(MacroAssembler *masm,
677 int total_args_passed,
678 int comp_args_on_stack,
679 const BasicType *sig_bt,
680 const VMRegPair *regs,
681 Label& skip_fixup) {
682 // Before we get into the guts of the C2I adapter, see if we should be here
683 // at all. We've come from compiled code and are attempting to jump to the
684 // interpreter, which means the caller made a static call to get here
685 // (vcalls always get a compiled target if there is one). Check for a
686 // compiled target. If there is one, we need to patch the caller's call.
687 patch_callers_callsite(masm);
688
689 __ bind(skip_fixup);
690
691 // Since all args are passed on the stack, total_args_passed *
692 // Interpreter::stackElementSize is the space we need.
693
694 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
695
696 int extraspace = (total_args_passed * Interpreter::stackElementSize);
697
698 // stack is aligned, keep it that way
699 // This is not currently needed or enforced by the interpreter, but
700 // we might as well conform to the ABI.
701 extraspace = align_up(extraspace, 2*wordSize);
702
703 // set senderSP value
704 __ lea(r13, Address(rsp, wordSize));
705
706 #ifdef ASSERT
707 __ check_stack_alignment(r13, "sender stack not aligned");
708 #endif
709 if (extraspace > 0) {
710 // Pop the return address
711 __ pop(rax);
712
713 __ subptr(rsp, extraspace);
714
715 // Push the return address
716 __ push(rax);
717
718 // Account for the return address location since we store it first rather
719 // than hold it in a register across all the shuffling
720 extraspace += wordSize;
721 }
722
723 #ifdef ASSERT
724 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
725 #endif
726
727 // Now write the args into the outgoing interpreter space
728 for (int i = 0; i < total_args_passed; i++) {
729 if (sig_bt[i] == T_VOID) {
730 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
731 continue;
732 }
733
734 // offset to start parameters
735 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
736 int next_off = st_off - Interpreter::stackElementSize;
737
738 // Say 4 args:
739 // i st_off
740 // 0 32 T_LONG
741 // 1 24 T_VOID
742 // 2 16 T_OBJECT
743 // 3 8 T_BOOL
744 // - 0 return address
745 //
746 // However to make thing extra confusing. Because we can fit a long/double in
747 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
748 // leaves one slot empty and only stores to a single slot. In this case the
749 // slot that is occupied is the T_VOID slot. See I said it was confusing.
750
751 VMReg r_1 = regs[i].first();
752 VMReg r_2 = regs[i].second();
753 if (!r_1->is_valid()) {
754 assert(!r_2->is_valid(), "");
755 continue;
756 }
757 if (r_1->is_stack()) {
758 // memory to memory use rax
759 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
760 if (!r_2->is_valid()) {
761 // sign extend??
762 __ movl(rax, Address(rsp, ld_off));
763 __ movptr(Address(rsp, st_off), rax);
764
765 } else {
766
767 __ movq(rax, Address(rsp, ld_off));
768
769 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
770 // T_DOUBLE and T_LONG use two slots in the interpreter
771 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
772 // ld_off == LSW, ld_off+wordSize == MSW
773 // st_off == MSW, next_off == LSW
774 __ movq(Address(rsp, next_off), rax);
775 #ifdef ASSERT
776 // Overwrite the unused slot with known junk
777 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
778 __ movptr(Address(rsp, st_off), rax);
779 #endif /* ASSERT */
780 } else {
781 __ movq(Address(rsp, st_off), rax);
782 }
783 }
784 } else if (r_1->is_Register()) {
785 Register r = r_1->as_Register();
786 if (!r_2->is_valid()) {
787 // must be only an int (or less ) so move only 32bits to slot
788 // why not sign extend??
789 __ movl(Address(rsp, st_off), r);
790 } else {
791 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
792 // T_DOUBLE and T_LONG use two slots in the interpreter
793 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
794 // long/double in gpr
795 #ifdef ASSERT
796 // Overwrite the unused slot with known junk
797 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
798 __ movptr(Address(rsp, st_off), rax);
799 #endif /* ASSERT */
800 __ movq(Address(rsp, next_off), r);
801 } else {
802 __ movptr(Address(rsp, st_off), r);
803 }
804 }
805 } else {
806 assert(r_1->is_XMMRegister(), "");
807 if (!r_2->is_valid()) {
808 // only a float use just part of the slot
809 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
810 } else {
811 #ifdef ASSERT
812 // Overwrite the unused slot with known junk
813 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
814 __ movptr(Address(rsp, st_off), rax);
815 #endif /* ASSERT */
816 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
817 }
818 }
819 }
820
821 // Schedule the branch target address early.
822 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
823 __ jmp(rcx);
824 }
825
// Emits the interpreter-to-compiled (i2c) adapter. The generated code repacks
// the arguments from the interpreter's stack layout into the locations given
// by 'regs' and then jumps to the address loaded from
// Method::from_compiled_offset() of the Method* in rbx.
//
// Parameters:
//   masm               - assembler that receives the generated code
//   total_args_passed  - number of entries in sig_bt / regs (the T_VOID
//                        second halves of longs and doubles are included)
//   comp_args_on_stack - number of 4-byte stack slots used by the compiled
//                        convention for these arguments
//   sig_bt             - basic type of each argument
//   regs               - destination (register or stack slot) per argument
//
// Register usage visible in the code below:
//   rbx - Method* (read for the jump target, stored to
//         JavaThread::callee_target, copied to rax before the jump)
//   rax - return address while rsp is realigned, then the saved original SP
//         ("saved_sp") during the shuffle, finally a copy of rbx
//   r11 - original SP at first, afterwards the jump target
//   r13 - scratch for stack-to-stack argument moves
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.
  // NOTE(review): the code below locates the interpreter args through
  // saved_sp (rax) and overwrites r13 as a scratch register for
  // stack-to-stack moves; confirm whether the note above is still accurate.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  // Reserve room for the outgoing stack arguments, if there are any.
  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address back. This misaligns the stack by one word, which
  // is what the youngest frame always sees at the placement of a call
  // instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

  // Now generate the shuffle code. Register args are loaded directly from the
  // interpreter's slots; stack args are copied slot-to-slot through r13.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Second half of a long or double: nothing to move.
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down: argument 0 has the largest offset
    // from saved_sp, the last argument sits one stack element above it.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag); this is the slot one element
    // below ld_off.
    int next_off = ld_off - Interpreter::stackElementSize;
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      // No destination assigned to this entry.
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // Single-slot value: 32-bit load, full-word store.
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      // rax holds saved_sp for the whole shuffle, so it cannot be a target.
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // Remaining case: the destination is an XMM register.
      if (!r_2->is_valid()) {
        // float: taken from ld_off
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        // double: taken from next_off, like the T_LONG/T_DOUBLE cases above
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // Put Method* where a c2i would expect it should we end up there.
  // Only needed because c2 resolve stubs return Method* as a result in
  // rax.
  __ mov(rax, rbx);
  __ jmp(r11);
}
988
989 // ---------------------------------------------------------------
990 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
991 int total_args_passed,
992 int comp_args_on_stack,
993 const BasicType *sig_bt,
994 const VMRegPair *regs,
995 address entry_address[AdapterBlob::ENTRY_COUNT]) {
996 entry_address[AdapterBlob::I2C] = __ pc();
997
998 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
999
1000 // -------------------------------------------------------------------------
1001 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
1002 // to the interpreter. The args start out packed in the compiled layout. They
1003 // need to be unpacked into the interpreter layout. This will almost always
1004 // require some stack space. We grow the current (compiled) stack, then repack
1005 // the args. We finally end in a jump to the generic interpreter entry point.
1006 // On exit from the interpreter, the interpreter will restore our SP (lest the
1007 // compiled code, which relies solely on SP and not RBP, get sick).
1008
1009 entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1010 Label skip_fixup;
1011
1012 Register data = rax;
1013 Register receiver = j_rarg0;
1014 Register temp = rbx;
1015
1016 {
1017 __ ic_check(1 /* end_alignment */);
1018 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1019 // Method might have been compiled since the call site was patched to
1020 // interpreted if that is the case treat it as a miss so we can get
1021 // the call site corrected.
1022 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1023 __ jcc(Assembler::equal, skip_fixup);
1024 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1025 }
1026
1027 entry_address[AdapterBlob::C2I] = __ pc();
1028
1029 // Class initialization barrier for static methods
1030 entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1031 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1032 Label L_skip_barrier;
1033 Register method = rbx;
1034
1035 // Bypass the barrier for non-static methods
1036 Register flags = rscratch1;
1037 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1038 __ testl(flags, JVM_ACC_STATIC);
1039 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1040
1041 Register klass = rscratch1;
1042 __ load_method_holder(klass, method);
1043 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1044
1045 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1046
1047 __ bind(L_skip_barrier);
1048 entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1049
1050 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1051 bs->c2i_entry_barrier(masm);
1052
1053 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1054 return;
1055 }
1056
1057 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1058 VMRegPair *regs,
1059 int total_args_passed) {
1060
1061 // We return the amount of VMRegImpl stack slots we need to reserve for all
1062 // the arguments NOT counting out_preserve_stack_slots.
1063
1064 // NOTE: These arrays will have to change when c1 is ported
1065 #ifdef _WIN64
1066 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1067 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1068 };
1069 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1070 c_farg0, c_farg1, c_farg2, c_farg3
1071 };
1072 #else
1073 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1074 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1075 };
1076 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1077 c_farg0, c_farg1, c_farg2, c_farg3,
1078 c_farg4, c_farg5, c_farg6, c_farg7
1079 };
1080 #endif // _WIN64
1081
1082
1083 uint int_args = 0;
1084 uint fp_args = 0;
1085 uint stk_args = 0; // inc by 2 each time
1086
1087 for (int i = 0; i < total_args_passed; i++) {
1088 switch (sig_bt[i]) {
1089 case T_BOOLEAN:
1090 case T_CHAR:
1091 case T_BYTE:
1092 case T_SHORT:
1093 case T_INT:
1094 if (int_args < Argument::n_int_register_parameters_c) {
1095 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1096 #ifdef _WIN64
1097 fp_args++;
1098 // Allocate slots for callee to stuff register args the stack.
1099 stk_args += 2;
1100 #endif
1101 } else {
1102 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1103 stk_args += 2;
1104 }
1105 break;
1106 case T_LONG:
1107 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1108 // fall through
1109 case T_OBJECT:
1110 case T_ARRAY:
1111 case T_ADDRESS:
1112 case T_METADATA:
1113 if (int_args < Argument::n_int_register_parameters_c) {
1114 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116 fp_args++;
1117 stk_args += 2;
1118 #endif
1119 } else {
1120 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1121 stk_args += 2;
1122 }
1123 break;
1124 case T_FLOAT:
1125 if (fp_args < Argument::n_float_register_parameters_c) {
1126 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1127 #ifdef _WIN64
1128 int_args++;
1129 // Allocate slots for callee to stuff register args the stack.
1130 stk_args += 2;
1131 #endif
1132 } else {
1133 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1134 stk_args += 2;
1135 }
1136 break;
1137 case T_DOUBLE:
1138 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1139 if (fp_args < Argument::n_float_register_parameters_c) {
1140 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1141 #ifdef _WIN64
1142 int_args++;
1143 // Allocate slots for callee to stuff register args the stack.
1144 stk_args += 2;
1145 #endif
1146 } else {
1147 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1148 stk_args += 2;
1149 }
1150 break;
1151 case T_VOID: // Halves of longs and doubles
1152 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1153 regs[i].set_bad();
1154 break;
1155 default:
1156 ShouldNotReachHere();
1157 break;
1158 }
1159 }
1160 #ifdef _WIN64
1161 // windows abi requires that we always allocate enough stack space
1162 // for 4 64bit registers to be stored down.
1163 if (stk_args < 8) {
1164 stk_args = 8;
1165 }
1166 #endif // _WIN64
1167
1168 return stk_args;
1169 }
1170
1171 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1172 uint num_bits,
1173 uint total_args_passed) {
1174 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1175 "only certain vector sizes are supported for now");
1176
1177 static const XMMRegister VEC_ArgReg[32] = {
1178 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1179 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1180 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1181 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1182 };
1183
1184 uint stk_args = 0;
1185 uint fp_args = 0;
1186
1187 for (uint i = 0; i < total_args_passed; i++) {
1188 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1189 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1190 regs[i].set_pair(vmreg->next(next_val), vmreg);
1191 }
1192
1193 return stk_args;
1194 }
1195
1196 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1197 // We always ignore the frame_slots arg and just use the space just below frame pointer
1198 // which by this time is free to use
1199 switch (ret_type) {
1200 case T_FLOAT:
1201 __ movflt(Address(rbp, -wordSize), xmm0);
1202 break;
1203 case T_DOUBLE:
1204 __ movdbl(Address(rbp, -wordSize), xmm0);
1205 break;
1206 case T_VOID: break;
1207 default: {
1208 __ movptr(Address(rbp, -wordSize), rax);
1209 }
1210 }
1211 }
1212
1213 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1214 // We always ignore the frame_slots arg and just use the space just below frame pointer
1215 // which by this time is free to use
1216 switch (ret_type) {
1217 case T_FLOAT:
1218 __ movflt(xmm0, Address(rbp, -wordSize));
1219 break;
1220 case T_DOUBLE:
1221 __ movdbl(xmm0, Address(rbp, -wordSize));
1222 break;
1223 case T_VOID: break;
1224 default: {
1225 __ movptr(rax, Address(rbp, -wordSize));
1226 }
1227 }
1228 }
1229
1230 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1231 for ( int i = first_arg ; i < arg_count ; i++ ) {
1232 if (args[i].first()->is_Register()) {
1233 __ push(args[i].first()->as_Register());
1234 } else if (args[i].first()->is_XMMRegister()) {
1235 __ subptr(rsp, 2*wordSize);
1236 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1237 }
1238 }
1239 }
1240
1241 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1242 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1243 if (args[i].first()->is_Register()) {
1244 __ pop(args[i].first()->as_Register());
1245 } else if (args[i].first()->is_XMMRegister()) {
1246 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1247 __ addptr(rsp, 2*wordSize);
1248 }
1249 }
1250 }
1251
1252 static void verify_oop_args(MacroAssembler* masm,
1253 const methodHandle& method,
1254 const BasicType* sig_bt,
1255 const VMRegPair* regs) {
1256 Register temp_reg = rbx; // not part of any compiled calling seq
1257 if (VerifyOops) {
1258 for (int i = 0; i < method->size_of_parameters(); i++) {
1259 if (is_reference_type(sig_bt[i])) {
1260 VMReg r = regs[i].first();
1261 assert(r->is_valid(), "bad oop arg");
1262 if (r->is_stack()) {
1263 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1264 __ verify_oop(temp_reg);
1265 } else {
1266 __ verify_oop(r->as_Register());
1267 }
1268 }
1269 }
1270 }
1271 }
1272
// Debug-only sanity check that an argument of the continuation-enter
// intrinsic lives in the register the generated code expects.
//
//   actual_vmreg - location the calling convention assigned to the argument
//   expected_reg - register the generated code reads the argument from
//   name         - human-readable argument name used in the assert messages
//
// The body consists solely of asserts.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  // The argument must not have been assigned a stack slot ...
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  // ... and it must be exactly the expected register.
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1281
1282
//---------------------------- continuation_enter_setup ---------------------------
//
// Emits code that reserves a ContinuationEntry on the stack and links it in as
// the current thread's continuation entry.
//
// Arguments:
//   masm:        assembler that receives the generated code
//   stack_slots: in/out, increased by ContinuationEntry::size() / wordSize
//   (the generated code itself takes no register arguments)
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//   returns a newly allocated OopMap sized for the entry plus one word
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  // Carve the entry out of the stack.
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // Frame size in stack slots: the entry plus one extra word.
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Remember the thread's previous entry as this entry's parent, then make
  // this entry (at rsp) the thread's current one.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1311
//---------------------------- fill_continuation_entry ---------------------------
//
// Emits code that initializes the fields of the ContinuationEntry at rsp and
// resets the thread's cont_fastpath field.
//
// Arguments:
//   rsp: pointer to blank Continuation entry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags (stored as a 32-bit value)
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  // rax is used as a temp below, so neither input may live in it.
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug builds additionally stamp the entry with a cookie value.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  // chunk, argsize and pin_count all start out as zero.
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the thread's current cont_fastpath in the entry ...
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  // ... and clear it in the thread.
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1341
//---------------------------- continuation_enter_cleanup ---------------------------
//
// Emits code that unlinks the ContinuationEntry at rsp from the current thread
// and releases its stack space.
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // Debug builds stop if rsp is not the thread's current continuation entry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the thread's cont_fastpath from the value saved in the entry.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  // Make the parent entry the thread's current entry again.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  // Pop the entry off the stack.
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1367
// Emits the code for the enterSpecial continuation intrinsic. Two entry points
// are generated (an interpreted one and a compiled one) that share the thaw
// path, the normal exit and the exception handling path.
//
// Parameters:
//   masm                     - assembler that receives the generated code
//   regs                     - locations of the three Java arguments; checked
//                              against c_rarg1 / c_rarg2 / c_rarg3
//   exception_offset         - out: offset of the exception handling path
//   oop_maps                 - receives a GC map after each emitted call
//   frame_complete           - out: offset at which the compiled entry's frame
//                              is complete
//   stack_slots              - out: set to 2, then adjusted by
//                              continuation_enter_setup
//   interpreted_entry_offset - out: offset of the interpreted entry
//   compiled_entry_offset    - out: offset of the compiled entry
//
// All offsets are relative to the pc at which generation starts. The function
// also sets ContinuationEntry::_thaw_call_pc_offset, _return_pc_offset and
// _cleanup_offset.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  // Call target used by both entries for the not-yet-resolved static call.
  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    // Debug builds stop if this entry is reached while interp_only_mode is 0.
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    // The isVirtualThread argument is what gets stored as the entry's flags.
    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    // The GC map is registered at the return address of the call.
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  // The isVirtualThread argument is what gets stored as the entry's flags.
  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  // The GC map is registered at the return address of the call.
  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path (shared by both entries)

  __ bind(L_thaw);

  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  // The return address of the thaw call is recorded and gets its own copy
  // of the map.
  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  // rsp now points at the return address; pass it, together with the
  // thread, to exception_handler_for_return_address.
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  // The call's result (in rax) is the handler to continue at.
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);        // exception oop saved above
  __ verify_oop(rax);
  __ pop(rdx);        // return address, used as the exception pc
  __ jmp(rbx);
}
1540
// Emits the code for the continuation yield intrinsic. The generated code
// calls Continuation::freeze_entry() with the current thread and sp. A zero
// result unwinds to the thread's continuation entry and returns from there;
// a non-zero result ("pinned") returns to the caller, forwarding a pending
// exception if one is set.
//
// Parameters:
//   masm                  - assembler that receives the generated code
//   regs                  - not used by this function
//   oop_maps              - receives the GC map registered at frame_complete
//   frame_complete        - out: offset of the pc right after enter()
//   stack_slots           - out: framesize / VMRegImpl::slots_per_word
//                           (asserted to be 2)
//   compiled_entry_offset - out: offset of the entry, i.e. of the first
//                           emitted instruction
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in stack slots: saved rbp followed by the return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Call freeze_entry(thread, sp) with the last Java frame set to this frame.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // A non-zero result in rax takes the pinned path.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Zero result: continue at the thread's continuation entry, tear it down
  // and return from there.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1602
// SharedRuntime entry point that forwards to the file-local
// continuation_enter_cleanup(); the leading '::' selects that function
// rather than this member.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1606
// Emits the dispatch code for a method handle intrinsic. It works out which
// registers hold the receiver and the trailing member argument (as required
// by the intrinsic id) and hands them to
// MethodHandles::generate_method_handle_dispatch().
//
// Parameters:
//   masm   - assembler that receives the generated code
//   method - the intrinsic method; supplies the intrinsic id and the
//            parameter count
//   sig_bt - basic type of each argument
//   regs   - compiled-convention location of each argument
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Decide, from the intrinsic id, whether a receiver and/or a trailing
  // member argument take part in the dispatch.
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Stack argument: load it into rbx (the extra wordSize skips the word
      // at the top of the stack).
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      // The two statements after fatal() show what such a port would do.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1668
1669 // ---------------------------------------------------------------------------
1670 // Generate a native wrapper for a given method. The method takes arguments
1671 // in the Java compiled code convention, marshals them to the native
1672 // convention (handlizes oops, etc), transitions to native, makes the call,
1673 // returns to java state (possibly blocking), unhandlizes any result and
1674 // returns.
1675 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1684 //
// Parameters:
//   masm       - assembler that the wrapper code is emitted into
//   method     - the method being wrapped; continuation intrinsics and method
//                handle intrinsics take dedicated early-return paths below
//   compile_id - passed through unchanged to nmethod::new_native_nmethod
//   in_sig_bt  - basic types of the incoming Java arguments, indexed
//                0 .. method->size_of_parameters() - 1
//   in_regs    - locations of the incoming Java arguments, parallel to in_sig_bt
//   ret_type   - basic type of the native result; selects how rax is
//                normalized after the call and how the result is saved/restored
//
// Returns the nmethod produced by nmethod::new_native_nmethod. On the
// continuation-intrinsic path a nullptr result is returned to the caller as is.
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  // ---- Path 1: Continuation enter/yield intrinsics --------------------------
  // The code is produced by gen_continuation_enter / gen_continuation_yield,
  // which fill in the offsets below through their reference parameters.
  if (method->is_continuation_native_intrinsic()) {
    // -1 means "not set"; the ASSERT block below checks which ones were filled in.
    int exception_offset = -1;
    OopMapSet* oop_maps = new OopMapSet();
    int frame_complete = -1;
    int stack_slots = -1;
    int interpreted_entry_offset = -1;
    int vep_offset = -1;
    if (method->is_continuation_enter_intrinsic()) {
      gen_continuation_enter(masm,
                             in_regs,
                             exception_offset,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             interpreted_entry_offset,
                             vep_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      gen_continuation_yield(masm,
                             in_regs,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             vep_offset);
    } else {
      guarantee(false, "Unknown Continuation native intrinsic");
    }

#ifdef ASSERT
    // Only the enter intrinsic is expected to set the interpreted entry and
    // exception offsets; the remaining values must be set for both intrinsics.
    if (method->is_continuation_enter_intrinsic()) {
      assert(interpreted_entry_offset != -1, "Must be set");
      assert(exception_offset != -1, "Must be set");
    } else {
      assert(interpreted_entry_offset == -1, "Must be unset");
      assert(exception_offset == -1, "Must be unset");
    }
    assert(frame_complete != -1, "Must be set");
    assert(stack_slots != -1, "Must be set");
    assert(vep_offset != -1, "Must be set");
#endif

    __ flush();
    // Both in_ByteSize(-1) arguments occupy the positions where the regular JNI
    // path below passes the receiver/klass offset and the lock slot offset.
    nmethod* nm = nmethod::new_native_nmethod(method,
                                              compile_id,
                                              masm->code(),
                                              vep_offset,
                                              frame_complete,
                                              stack_slots,
                                              in_ByteSize(-1),
                                              in_ByteSize(-1),
                                              oop_maps,
                                              exception_offset);
    if (nm == nullptr) return nm;
    // Publish the freshly created nmethod to whoever needs it.
    if (method->is_continuation_enter_intrinsic()) {
      ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      _cont_doYield_stub = nm;
    }
    return nm;
  }

  // ---- Path 2: method handle intrinsics --------------------------------------
  // The whole body is the dispatch sequence emitted by gen_special_dispatch; no
  // oop maps are supplied (nullptr).
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    // Nothing is emitted between 'start' and this point, so vep_offset is 0.
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }

  // ---- Path 3: regular JNI native method -------------------------------------
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  // All code offsets recorded below (vep_offset, frame_complete, oop map pcs)
  // are relative to this address.
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java arg are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method)

  const int total_in_args = method->size_of_parameters();
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);

  // Hidden leading C arguments: a T_ADDRESS (the JNIEnv*, loaded into c_rarg0
  // just before the call) and, for static methods, a T_OBJECT (the class
  // mirror handle, loaded into c_rarg1 below).
  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT;
  }

  // The Java arguments follow in their original order.
  for (int i = 0; i < total_in_args ; i++ ) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper. We need to handlize all oops in
  // incoming registers

  // Calculate the total number of stack slots we will need.

  // First count the abi requirement plus all of the outgoing args
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;

  // Now the space for the inbound oop handle area
  int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers

  int oop_handle_offset = stack_slots;
  stack_slots += total_save_slots;

  // Now any space we need for handlizing a klass if static method

  int klass_slot_offset = 0;  // in stack slots
  int klass_offset = -1;      // in bytes; stays -1 for non-static methods
  int lock_slot_offset = 0;   // in stack slots; stays 0 for non-synchronized methods
  bool is_static = false;

  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }

  // Plus a lock if needed

  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }

  // Now a place (+2) to save return values or temp during shuffling
  // + 4 for return address (which we own) and saved rbp
  stack_slots += 6;

  // Ok The space we have allocated will look like:
  //
  //
  // FP-> |                     |
  //      |---------------------|
  //      | 2 slots for moves   |
  //      |---------------------|
  //      | lock box (if sync)  |
  //      |---------------------| <- lock_slot_offset
  //      | klass (if static)   |
  //      |---------------------| <- klass_slot_offset
  //      | oopHandle area      |
  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  //      | outbound memory     |
  //      | based arguments     |
  //      |                     |
  //      |---------------------|
  //      |                     |
  // SP-> | out_preserved_slots |
  //
  //


  // Now compute actual number of stack words we need rounding to make
  // stack properly aligned.
  stack_slots = align_up(stack_slots, StackAlignmentInSlots);

  // Frame size in bytes, including the return address and saved rbp slots
  // counted above.
  int stack_size = stack_slots * VMRegImpl::stack_slot_size;

  // First thing make an ic check to see if we should even be here

  // We are free to use all registers as temps without saving them and
  // restoring them except rbp. rbp is the only callee save register
  // as far as the interpreter and the compiler(s) are concerned.

  const Register receiver = j_rarg0;

  Label exception_pending;

  assert_different_registers(receiver, rscratch1, rscratch2);
  __ verify_oop(receiver);
  __ ic_check(8 /* end_alignment */);

  // Offset of the verified entry point: the first instruction after the ic check.
  int vep_offset = ((intptr_t)__ pc()) - start;

  if (method->needs_clinit_barrier()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register klass = r10;
    __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

#ifdef COMPILER1
  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */


  // We use r14 as the oop handle for the receiver/klass
  // It is callee save so it survives the call to native

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  // (filled in by object_move below; stays -1 if no receiver is moved).
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do
  // All inbound args are referenced based on rbp and all outbound args via rsp.


#ifdef ASSERT
  // Debug-only bookkeeping: which registers have already been written as a
  // shuffle destination. Reading one of them later as a source is a bug.
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }

#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
  // arg_order is a flat list of (java arg index, c arg index) pairs.
  GrowableArray<int> arg_order(2 * total_in_args);

  // Pairs are pushed last argument first, so the moves below run backwards.
  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    // The source of this move must not have been overwritten by an earlier move.
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    // Record the destination of this move as clobbered.
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        // The boolean argument is true only for the receiver (argument 0 of a
        // non-static method); object_move also gets &receiver_offset to fill in.
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        // No move is emitted for T_VOID entries (e.g. the slot following a
        // T_DOUBLE, see the assert in that case).
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      // No break here: if the assert does not stop execution, control falls
      // through to the default move below.
      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14. Used both by locking code and
  // the normal JNI call code.
  // point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    // load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror it's known not-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out

  Label native_return;
  if (method->is_object_wait0()) {
    // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
    // The oop map for that pc is registered further down, where native_return is bound.
    __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
  } else {
    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);

    __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
  }

  // We have all of the arguments setup at this point. We must not touch any register
  // argument registers at this point (what if we save/restore them there are no oop?

  if (DTraceMethodProbes) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax; // Must use rax for cmpxchg instruction
  const Register obj_reg = rbx; // Will contain the oop
  const Register lock_reg = r13; // Address of compiler lock object (BasicLock)

  // slow_path_lock is bound in the out-of-line section near the end of this
  // function; that code jumps back to lock_done.
  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box

    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally just about ready to make the JNI call

  // get JNIEnv* which is first argument to native
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  // Sub-int results in rax are normalized here; all other result types are
  // left untouched at this point.
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax); break;
  case T_CHAR : __ movzwl(rax, rax); break;
  case T_BYTE : __ sign_extend_byte (rax); break;
  case T_SHORT : __ sign_extend_short(rax); break;
  case T_INT : /* nothing to do */ break;
  case T_DOUBLE :
  case T_FLOAT :
    // Result is in xmm0 we'll save as needed
    break;
  case T_ARRAY: // Really a handle
  case T_OBJECT: // Really a handle
    break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default : ShouldNotReachHere();
  }

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  // VM thread changes sync state to synchronizing and suspends threads for GC.
  // Thread A is resumed to finish this native method, but doesn't block here since it
  // didn't see any synchronization in progress, and escapes.
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  if (!UseSystemMemoryBarrier) {
    __ membar(Assembler::Membar_mask_bits(
              Assembler::LoadLoad | Assembler::LoadStore |
              Assembler::StoreLoad | Assembler::StoreStore));
  }

  // check for safepoint operation in progress and/or pending suspend requests
  {
    Label Continue;
    Label slow_path;

    __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);

    // No safepoint pending: skip the runtime call only if no suspend flag is
    // set either; otherwise fall into slow_path.
    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(slow_path);

    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here preventing us from clearing _last_native_pc down below.
    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
    //
    __ vzeroupper();
    save_native_result(masm, ret_type, stack_slots);
    __ mov(c_rarg0, r15_thread);
    // r12 holds the original rsp across the call. reinit_heapbase() is invoked
    // afterwards, presumably to re-establish r12's usual contents (confirm).
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
    // Restore any method result value
    restore_native_result(masm, ret_type, stack_slots);
    __ bind(Continue);
  }

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);

  if (method->is_object_wait0()) {
    // Check preemption for Object.wait()
    // A non-null alternate return address is cleared in the thread and jumped
    // to; a null one continues at native_return.
    __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
    __ cmpptr(rscratch1, NULL_WORD);
    __ jccb(Assembler::equal, native_return);
    __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
    __ jmp(rscratch1);
    __ bind(native_return);

    // This is the pc handed to set_last_Java_frame above for Object.wait, so
    // the oop map is registered here.
    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);
  }


  // The reguard code is emitted out of line at the end of this function and
  // jumps back to reguard_done.
  Label reguard;
  Label reguard_done;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, reguard);
  __ bind(reguard_done);

  // native result if any is live

  // Unlock
  Label slow_path_unlock;
  Label unlock_done;
  if (method->is_synchronized()) {

    // NOTE(review): fast_done is bound at the end of this block but nothing in
    // this function jumps to it.
    Label fast_done;

    // Get locked oop from the handle we passed to jni
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    // Must save rax if it is live now because cmpxchg must use it
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);

    // slow path re-enters here
    __ bind(unlock_done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      restore_native_result(masm, ret_type, stack_slots);
    }

    __ bind(fast_done);
  }
  if (DTraceMethodProbes) {
    // Keep the native result intact across the probe call.
    save_native_result(masm, ret_type, stack_slots);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
      r15_thread, c_rarg1);
    restore_native_result(masm, ret_type, stack_slots);
  }

  __ reset_last_Java_frame(false);

  // Unbox oop result, e.g. JNIHandles::resolve value.
  if (is_reference_type(ret_type)) {
    __ resolve_jobject(rax /* value */,
                       rcx /* tmp */);
  }

  if (CheckJNICalls) {
    // clear_pending_jni_exception_check
    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
  }

  // reset handle block
  __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
  __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);

  // pop our frame

  __ leave();

#if INCLUDE_JFR
  // We need to do a poll test after unwind in case the sampler
  // managed to sample the native frame after returning to Java.
  Label L_return;
  address poll_test_pc = __ pc();
  __ relocate(relocInfo::poll_return_type);
  __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  __ jccb(Assembler::zero, L_return);
  // Poll bit set: store the poll pc as the thread's saved exception pc and
  // jump to the polling page return handler stub.
  __ lea(rscratch1, InternalAddress(poll_test_pc));
  __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
  assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
         "polling page return stub not created yet");
  address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
  __ jump(RuntimeAddress(stub));
  __ bind(L_return);
#endif // INCLUDE_JFR

  // Any exception pending?
  __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
  __ jcc(Assembler::notEqual, exception_pending);

  // Return

  __ ret(0);

  // Unexpected paths are out of line and go here

  // forward the exception
  __ bind(exception_pending);

  // and forward the exception
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // Slow path locking & unlocking
  if (method->is_synchronized()) {

    // BEGIN Slow path lock
    __ bind(slow_path_lock);

    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
    // args are (oop obj, BasicLock* lock, JavaThread* thread)

    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg1, lock_reg);
    __ mov(c_rarg2, r15_thread);

    // Not a leaf but we have last_Java_frame setup as we want.
    // We don't want to unmount in case of contention since that would complicate preserving
    // the arguments that had already been marshalled into the native convention. So we force
    // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
    // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
    __ push_cont_fastpath();
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
    __ pop_cont_fastpath();
    restore_args(masm, total_c_args, c_arg, out_regs);

#ifdef ASSERT
    { Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit from monitorenter");
      __ bind(L);
    }
#endif
    __ jmp(lock_done);

    // END Slow path lock

    // BEGIN Slow path unlock
    __ bind(slow_path_unlock);

    // If we haven't already saved the native result we must save it now as xmm registers
    // are still exposed.
    __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg2, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI

    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
    // NOTE that obj_reg == rbx currently
    // (obj_reg was already copied into c_rarg0 above, so reusing rbx is safe here.)
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);

    // args are (oop obj, BasicLock* lock, JavaThread* thread)
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */

    // Put back the pending exception that was stashed in rbx before the call.
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);

    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);

    // END Slow path unlock

  } // synchronized

  // SLOW PATH Reguard the stack if needed

  __ bind(reguard);
  __ vzeroupper();
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  // and continue
  __ jmp(reguard_done);



  __ flush();

  // stack_slots is converted from slots to words. The two byte offsets are the
  // klass handle slot (static) or receiver slot (non-static), and the lock slot.
  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);

  return nm;
}
2443
2444 // this function returns the adjust size (in number of words) to a c2i adapter
2445 // activation for use during deoptimization
2446 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2447 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2448 }
2449
2450
2451 uint SharedRuntime::out_preserve_stack_slots() {
2452 return 0;
2453 }
2454
2455
2456 // Number of stack slots between incoming argument block and the start of
2457 // a new frame. The PROLOG must add this many slots to the stack. The
2458 // EPILOG must remove this many slots. amd64 needs two slots for
2459 // return address.
2460 uint SharedRuntime::in_preserve_stack_slots() {
2461 return 4 + 2 * VerifyStackAtCalls;
2462 }
2463
2464 VMReg SharedRuntime::thread_register() {
2465 return r15_thread->as_VMReg();
2466 }
2467
2468 //------------------------------generate_deopt_blob----------------------------
2469 void SharedRuntime::generate_deopt_blob() {
2470 // Allocate space for the code
2471 ResourceMark rm;
2472 // Setup code generation tools
2473 int pad = 0;
2474 if (UseAVX > 2) {
2475 pad += 1024;
2476 }
2477 if (UseAPX) {
2478 pad += 1024;
2479 }
2480 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2481 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2482 if (blob != nullptr) {
2483 _deopt_blob = blob->as_deoptimization_blob();
2484 return;
2485 }
2486
2487 CodeBuffer buffer(name, 2560+pad, 1024);
2488 MacroAssembler* masm = new MacroAssembler(&buffer);
2489 int frame_size_in_words;
2490 OopMap* map = nullptr;
2491 OopMapSet *oop_maps = new OopMapSet();
2492
2493 // -------------
2494 // This code enters when returning to a de-optimized nmethod. A return
2495 // address has been pushed on the stack, and return values are in
2496 // registers.
2497 // If we are doing a normal deopt then we were called from the patched
2498 // nmethod from the point we returned to the nmethod. So the return
2499 // address on the stack is wrong by NativeCall::instruction_size
2500 // We will adjust the value so it looks like we have the original return
2501 // address on the stack (like when we eagerly deoptimized).
2502 // In the case of an exception pending when deoptimizing, we enter
2503 // with a return address on the stack that points after the call we patched
2504 // into the exception handler. We have the following register state from,
2505 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2506 // rax: exception oop
2507 // rbx: exception handler
2508 // rdx: throwing pc
2509 // So in this case we simply jam rdx into the useless return address and
2510 // the stack looks just like we want.
2511 //
2512 // At this point we need to de-opt. We save the argument return
2513 // registers. We call the first C routine, fetch_unroll_info(). This
2514 // routine captures the return values and returns a structure which
2515 // describes the current frame size and the sizes of all replacement frames.
2516 // The current frame is compiled code and may contain many inlined
2517 // functions, each with their own JVM state. We pop the current frame, then
2518 // push all the new frames. Then we call the C routine unpack_frames() to
2519 // populate these frames. Finally unpack_frames() returns us the new target
2520 // address. Notice that callee-save registers are BLOWN here; they have
2521 // already been captured in the vframeArray at the time the return PC was
2522 // patched.
2523 address start = __ pc();
2524 Label cont;
2525
2526 // Prolog for non exception case!
2527
2528 // Save everything in sight.
2529 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2530
2531 // Normal deoptimization. Save exec mode for unpack_frames.
2532 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2533 __ jmp(cont);
2534
2535 int reexecute_offset = __ pc() - start;
2536 // Reexecute case
  // The return address is the pc that describes which bci to re-execute.
2538
2539 // No need to update map as each call to save_live_registers will produce identical oopmap
2540 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2541
2542 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2543 __ jmp(cont);
2544
2545 int exception_offset = __ pc() - start;
2546
2547 // Prolog for exception case
2548
2549 // all registers are dead at this entry point, except for rax, and
2550 // rdx which contain the exception oop and exception pc
2551 // respectively. Set them in TLS and fall thru to the
2552 // unpack_with_exception_in_tls entry point.
2553
2554 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2555 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2556
2557 int exception_in_tls_offset = __ pc() - start;
2558
2559 // new implementation because exception oop is now passed in JavaThread
2560
2561 // Prolog for exception case
2562 // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2564 // tos: stack at point of call to method that threw the exception (i.e. only
2565 // args are on the stack, no return address)
2566
2567 // make room on stack for the return address
2568 // It will be patched later with the throwing pc. The correct value is not
2569 // available now because loading it from memory would destroy registers.
2570 __ push(0);
2571
2572 // Save everything in sight.
2573 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2574
2575 // Now it is safe to overwrite any register
2576
2577 // Deopt during an exception. Save exec mode for unpack_frames.
2578 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2579
2580 // load throwing pc from JavaThread and patch it as the return address
2581 // of the current frame. Then clear the field in JavaThread
2582
2583 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2584 __ movptr(Address(rbp, wordSize), rdx);
2585 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2586
2587 #ifdef ASSERT
2588 // verify that there is really an exception oop in JavaThread
2589 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2590 __ verify_oop(rax);
2591
2592 // verify that there is no pending exception
2593 Label no_pending_exception;
2594 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2595 __ testptr(rax, rax);
2596 __ jcc(Assembler::zero, no_pending_exception);
2597 __ stop("must not have pending exception here");
2598 __ bind(no_pending_exception);
2599 #endif
2600
2601 __ bind(cont);
2602
2603 // Call C code. Need thread and this frame, but NOT official VM entry
2604 // crud. We cannot block on this call, no GC can happen.
2605 //
2606 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2607
2608 // fetch_unroll_info needs to call last_java_frame().
2609
2610 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2611 #ifdef ASSERT
2612 { Label L;
2613 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2614 __ jcc(Assembler::equal, L);
2615 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2616 __ bind(L);
2617 }
2618 #endif // ASSERT
2619 __ mov(c_rarg0, r15_thread);
2620 __ movl(c_rarg1, r14); // exec_mode
2621 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2622
2623 // Need to have an oopmap that tells fetch_unroll_info where to
2624 // find any register it might need.
2625 oop_maps->add_gc_map(__ pc() - start, map);
2626
2627 __ reset_last_Java_frame(false);
2628
2629 // Load UnrollBlock* into rdi
2630 __ mov(rdi, rax);
2631
2632 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2633 Label noException;
2634 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2635 __ jcc(Assembler::notEqual, noException);
2636 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2637 // QQQ this is useless it was null above
2638 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2639 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2640 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2641
2642 __ verify_oop(rax);
2643
2644 // Overwrite the result registers with the exception results.
2645 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2646 // I think this is useless
2647 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2648
2649 __ bind(noException);
2650
2651 // Only register save data is on the stack.
2652 // Now restore the result registers. Everything else is either dead
2653 // or captured in the vframeArray.
2654 RegisterSaver::restore_result_registers(masm);
2655
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
2658
2659 // Pop all the frames we must move/replace.
2660 //
2661 // Frame picture (youngest to oldest)
2662 // 1: self-frame (no frame link)
2663 // 2: deopting frame (no frame link)
2664 // 3: caller of deopting frame (could be compiled/interpreted).
2665 //
2666 // Note: by leaving the return address of self-frame on the stack
2667 // and using the size of frame 2 to adjust the stack
2668 // when we are done the return to frame 3 will still be on the stack.
2669
2670 // Pop deoptimized frame
2671 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2672 __ addptr(rsp, rcx);
2673
2674 // rsp should be pointing at the return address to the caller (3)
2675
2676 // Pick up the initial fp we should save
2677 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2678 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2679
2680 #ifdef ASSERT
2681 // Compilers generate code that bang the stack by as much as the
2682 // interpreter would need. So this stack banging should never
2683 // trigger a fault. Verify that it does not on non product builds.
2684 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2685 __ bang_stack_size(rbx, rcx);
2686 #endif
2687
2688 // Load address of array of frame pcs into rcx
2689 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2690
2691 // Trash the old pc
2692 __ addptr(rsp, wordSize);
2693
2694 // Load address of array of frame sizes into rsi
2695 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2696
2697 // Load counter into rdx
2698 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2699
2700 // Now adjust the caller's stack to make up for the extra locals
2701 // but record the original sp so that we can save it in the skeletal interpreter
2702 // frame and the stack walking of interpreter_sender will get the unextended sp
2703 // value and not the "real" sp value.
2704
2705 const Register sender_sp = r8;
2706
2707 __ mov(sender_sp, rsp);
2708 __ movl(rbx, Address(rdi,
2709 Deoptimization::UnrollBlock::
2710 caller_adjustment_offset()));
2711 __ subptr(rsp, rbx);
2712
2713 // Push interpreter frames in a loop
2714 Label loop;
2715 __ bind(loop);
2716 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2717 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2718 __ pushptr(Address(rcx, 0)); // Save return address
2719 __ enter(); // Save old & set new ebp
2720 __ subptr(rsp, rbx); // Prolog
2721 // This value is corrected by layout_activation_impl
2722 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2723 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2724 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2725 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2726 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2727 __ decrementl(rdx); // Decrement counter
2728 __ jcc(Assembler::notZero, loop);
2729 __ pushptr(Address(rcx, 0)); // Save final return address
2730
2731 // Re-push self-frame
2732 __ enter(); // Save old & set new ebp
2733
2734 // Allocate a full sized register save area.
2735 // Return address and rbp are in place, so we allocate two less words.
2736 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2737
2738 // Restore frame locals after moving the frame
2739 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2740 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2741
2742 // Call C code. Need thread but NOT official VM entry
2743 // crud. We cannot block on this call, no GC can happen. Call should
2744 // restore return values to their stack-slots with the new SP.
2745 //
2746 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2747
2748 // Use rbp because the frames look interpreted now
2749 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2750 // Don't need the precise return PC here, just precise enough to point into this code blob.
2751 address the_pc = __ pc();
2752 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2753
2754 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2755 __ mov(c_rarg0, r15_thread);
2756 __ movl(c_rarg1, r14); // second arg: exec_mode
2757 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2758 // Revert SP alignment after call since we're going to do some SP relative addressing below
2759 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2760
2761 // Set an oopmap for the call site
2762 // Use the same PC we used for the last java frame
2763 oop_maps->add_gc_map(the_pc - start,
2764 new OopMap( frame_size_in_words, 0 ));
2765
2766 // Clear fp AND pc
2767 __ reset_last_Java_frame(true);
2768
2769 // Collect return values
2770 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2771 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2772 // I think this is useless (throwing pc?)
2773 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2774
2775 // Pop self-frame.
2776 __ leave(); // Epilog
2777
2778 // Jump to interpreter
2779 __ ret(0);
2780
2781 // Make sure all code is generated
2782 masm->flush();
2783
2784 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2785 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2786
2787 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2788 }
2789
2790 //------------------------------generate_handler_blob------
2791 //
2792 // Generate a special Compile2Runtime blob that saves all registers,
2793 // and setup oopmap.
2794 //
2795 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2796 assert(StubRoutines::forward_exception_entry() != nullptr,
2797 "must be generated before");
2798 assert(is_polling_page_id(id), "expected a polling page stub id");
2799
2800 // Allocate space for the code. Setup code generation tools.
2801 const char* name = SharedRuntime::stub_name(id);
2802 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2803 if (blob != nullptr) {
2804 return blob->as_safepoint_blob();
2805 }
2806
2807 ResourceMark rm;
2808 OopMapSet *oop_maps = new OopMapSet();
2809 OopMap* map;
2810 CodeBuffer buffer(name, 2548, 1024);
2811 MacroAssembler* masm = new MacroAssembler(&buffer);
2812
2813 address start = __ pc();
2814 address call_pc = nullptr;
2815 int frame_size_in_words;
2816 bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2817 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2818
2819 // Make room for return address (or push it again)
2820 if (!cause_return) {
2821 __ push(rbx);
2822 }
2823
2824 // Save registers, fpu state, and flags
2825 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2826
2827 // The following is basically a call_VM. However, we need the precise
2828 // address of the call in order to generate an oopmap. Hence, we do all the
2829 // work ourselves.
2830
2831 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2832
2833 // The return address must always be correct so that frame constructor never
2834 // sees an invalid pc.
2835
2836 if (!cause_return) {
2837 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2838 // Additionally, rbx is a callee saved register and we can look at it later to determine
2839 // if someone changed the return address for us!
2840 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2841 __ movptr(Address(rbp, wordSize), rbx);
2842 }
2843
2844 // Do the call
2845 __ mov(c_rarg0, r15_thread);
2846 __ call(RuntimeAddress(call_ptr));
2847
2848 // Set an oopmap for the call site. This oopmap will map all
2849 // oop-registers and debug-info registers as callee-saved. This
2850 // will allow deoptimization at this safepoint to find all possible
2851 // debug-info recordings, as well as let GC find all oops.
2852
2853 oop_maps->add_gc_map( __ pc() - start, map);
2854
2855 Label noException;
2856
2857 __ reset_last_Java_frame(false);
2858
2859 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2860 __ jcc(Assembler::equal, noException);
2861
2862 // Exception pending
2863
2864 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2865
2866 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2867
2868 // No exception case
2869 __ bind(noException);
2870
2871 Label no_adjust;
2872 #ifdef ASSERT
2873 Label bail;
2874 #endif
2875 if (!cause_return) {
2876 Label no_prefix, not_special, check_rex_prefix;
2877
2878 // If our stashed return pc was modified by the runtime we avoid touching it
2879 __ cmpptr(rbx, Address(rbp, wordSize));
2880 __ jcc(Assembler::notEqual, no_adjust);
2881
2882 // Skip over the poll instruction.
2883 // See NativeInstruction::is_safepoint_poll()
2884 // Possible encodings:
2885 // 85 00 test %eax,(%rax)
2886 // 85 01 test %eax,(%rcx)
2887 // 85 02 test %eax,(%rdx)
2888 // 85 03 test %eax,(%rbx)
2889 // 85 06 test %eax,(%rsi)
2890 // 85 07 test %eax,(%rdi)
2891 //
2892 // 41 85 00 test %eax,(%r8)
2893 // 41 85 01 test %eax,(%r9)
2894 // 41 85 02 test %eax,(%r10)
2895 // 41 85 03 test %eax,(%r11)
2896 // 41 85 06 test %eax,(%r14)
2897 // 41 85 07 test %eax,(%r15)
2898 //
2899 // 85 04 24 test %eax,(%rsp)
2900 // 41 85 04 24 test %eax,(%r12)
2901 // 85 45 00 test %eax,0x0(%rbp)
2902 // 41 85 45 00 test %eax,0x0(%r13)
2903 //
2904 // Notes:
2905 // Format of legacy MAP0 test instruction:-
2906 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
2907 // o For safepoint polling instruction "test %eax,(%rax)", encoding of first register
2908 // operand and base register of memory operand is b/w [0-8), hence we do not require
2909 // additional REX prefix where REX.B bit stores MSB bit of register encoding, which
2910 // is why two bytes encoding is sufficient here.
2911 // o For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
2912 // register of memory operand is 1000, thus we need additional REX prefix in this case,
2913 // there by adding additional byte to instruction encoding.
2914 // o In case BASE register is one of the 32 extended GPR registers available only on targets
2915 // supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
2916 // most significant two bits of 5 bit register encoding.
2917
2918 if (VM_Version::supports_apx_f()) {
2919 __ cmpb(Address(rbx, 0), Assembler::REX2);
2920 __ jccb(Assembler::notEqual, check_rex_prefix);
2921 __ addptr(rbx, 2);
2922 __ bind(check_rex_prefix);
2923 }
2924 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2925 __ jccb(Assembler::notEqual, no_prefix);
2926 __ addptr(rbx, 1);
2927 __ bind(no_prefix);
2928 #ifdef ASSERT
2929 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2930 #endif
2931 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2932 // r12/rsp 0x04
2933 // r13/rbp 0x05
2934 __ movzbq(rcx, Address(rbx, 1));
2935 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2936 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
2937 __ cmpptr(rcx, 1);
2938 __ jccb(Assembler::above, not_special);
2939 __ addptr(rbx, 1);
2940 __ bind(not_special);
2941 #ifdef ASSERT
2942 // Verify the correct encoding of the poll we're about to skip.
2943 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2944 __ jcc(Assembler::notEqual, bail);
2945 // Mask out the modrm bits
2946 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2947 // rax encodes to 0, so if the bits are nonzero it's incorrect
2948 __ jcc(Assembler::notZero, bail);
2949 #endif
2950 // Adjust return pc forward to step over the safepoint poll instruction
2951 __ addptr(rbx, 2);
2952 __ movptr(Address(rbp, wordSize), rbx);
2953 }
2954
2955 __ bind(no_adjust);
2956 // Normal exit, restore registers and exit.
2957 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2958 __ ret(0);
2959
2960 #ifdef ASSERT
2961 __ bind(bail);
2962 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2963 #endif
2964
2965 // Make sure all code is generated
2966 masm->flush();
2967
2968 // Fill-out other meta info
2969 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2970
2971 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2972 return sp_blob;
2973 }
2974
2975 //
2976 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
2977 //
2978 // Generate a stub that calls into vm to find out the proper destination
2979 // of a java call. All the argument registers are live at this point
2980 // but since this is generic code we don't know what they are and the caller
2981 // must do any gc of the args.
2982 //
2983 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
2984 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
2985 assert(is_resolve_id(id), "expected a resolve stub id");
2986
2987 const char* name = SharedRuntime::stub_name(id);
2988 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2989 if (blob != nullptr) {
2990 return blob->as_runtime_stub();
2991 }
2992
2993 // allocate space for the code
2994 ResourceMark rm;
2995 CodeBuffer buffer(name, 1552, 512);
2996 MacroAssembler* masm = new MacroAssembler(&buffer);
2997
2998 int frame_size_in_words;
2999
3000 OopMapSet *oop_maps = new OopMapSet();
3001 OopMap* map = nullptr;
3002
3003 int start = __ offset();
3004
3005 // No need to save vector registers since they are caller-saved anyway.
3006 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3007
3008 int frame_complete = __ offset();
3009
3010 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3011
3012 __ mov(c_rarg0, r15_thread);
3013
3014 __ call(RuntimeAddress(destination));
3015
3016
3017 // Set an oopmap for the call site.
3018 // We need this not only for callee-saved registers, but also for volatile
3019 // registers that the compiler might be keeping live across a safepoint.
3020
3021 oop_maps->add_gc_map( __ offset() - start, map);
3022
3023 // rax contains the address we are going to jump to assuming no exception got installed
3024
3025 // clear last_Java_sp
3026 __ reset_last_Java_frame(false);
3027 // check for pending exceptions
3028 Label pending;
3029 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3030 __ jcc(Assembler::notEqual, pending);
3031
3032 // get the returned Method*
3033 __ get_vm_result_metadata(rbx);
3034 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3035
3036 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3037
3038 RegisterSaver::restore_live_registers(masm);
3039
3040 // We are back to the original state on entry and ready to go.
3041
3042 __ jmp(rax);
3043
3044 // Pending exception after the safepoint
3045
3046 __ bind(pending);
3047
3048 RegisterSaver::restore_live_registers(masm);
3049
3050 // exception pending => remove activation and forward to exception handler
3051
3052 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3053
3054 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3055 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3056
3057 // -------------
3058 // make sure all code is generated
3059 masm->flush();
3060
3061 // return the blob
3062 // frame_size_words or bytes??
3063 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3064
3065 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3066 return rs_blob;
3067 }
3068
3069 // Continuation point for throwing of implicit exceptions that are
3070 // not handled in the current activation. Fabricates an exception
3071 // oop and initiates normal exception dispatching in this
3072 // frame. Since we need to preserve callee-saved values (currently
3073 // only for C2, but done for C1 as well) we need a callee-saved oop
3074 // map and therefore have to make these stubs into RuntimeStubs
3075 // rather than BufferBlobs. If the compiler needs all registers to
3076 // be preserved between the fault point and the exception handler
3077 // then it must assume responsibility for that in
3078 // AbstractCompiler::continuation_for_implicit_null_exception or
3079 // continuation_for_implicit_division_by_zero_exception. All other
3080 // implicit exceptions (e.g., NullPointerException or
3081 // AbstractMethodError on entry) are either at call sites or
3082 // otherwise assume that stack unwinding will be initiated, so
3083 // caller saved registers were assumed volatile in the compiler.
3084 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3085 assert(is_throw_id(id), "expected a throw stub id");
3086
3087 const char* name = SharedRuntime::stub_name(id);
3088
3089 // Information about frame layout at time of blocking runtime call.
3090 // Note that we only have to preserve callee-saved registers since
3091 // the compilers are responsible for supplying a continuation point
3092 // if they expect all registers to be preserved.
3093 enum layout {
3094 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3095 rbp_off2,
3096 return_off,
3097 return_off2,
3098 framesize // inclusive of return address
3099 };
3100
3101 int insts_size = 512;
3102 int locs_size = 64;
3103
3104 const char* timer_msg = "SharedRuntime generate_throw_exception";
3105 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3106
3107 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3108 if (blob != nullptr) {
3109 return blob->as_runtime_stub();
3110 }
3111
3112 ResourceMark rm;
3113 CodeBuffer code(name, insts_size, locs_size);
3114 OopMapSet* oop_maps = new OopMapSet();
3115 MacroAssembler* masm = new MacroAssembler(&code);
3116
3117 address start = __ pc();
3118
3119 // This is an inlined and slightly modified version of call_VM
3120 // which has the ability to fetch the return PC out of
3121 // thread-local storage and also sets up last_Java_sp slightly
3122 // differently than the real call_VM
3123
3124 __ enter(); // required for proper stackwalking of RuntimeStub frame
3125
3126 assert(is_even(framesize/2), "sp not 16-byte aligned");
3127
3128 // return address and rbp are already in place
3129 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3130
3131 int frame_complete = __ pc() - start;
3132
3133 // Set up last_Java_sp and last_Java_fp
3134 address the_pc = __ pc();
3135 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3136 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3137
3138 // Call runtime
3139 __ movptr(c_rarg0, r15_thread);
3140 BLOCK_COMMENT("call runtime_entry");
3141 __ call(RuntimeAddress(runtime_entry));
3142
3143 // Generate oop map
3144 OopMap* map = new OopMap(framesize, 0);
3145
3146 oop_maps->add_gc_map(the_pc - start, map);
3147
3148 __ reset_last_Java_frame(true);
3149
3150 __ leave(); // required for proper stackwalking of RuntimeStub frame
3151
3152 // check for pending exceptions
3153 #ifdef ASSERT
3154 Label L;
3155 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3156 __ jcc(Assembler::notEqual, L);
3157 __ should_not_reach_here();
3158 __ bind(L);
3159 #endif // ASSERT
3160 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3161
3162
3163 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3164 RuntimeStub* stub =
3165 RuntimeStub::new_runtime_stub(name,
3166 &code,
3167 frame_complete,
3168 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3169 oop_maps, false);
3170 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3171
3172 return stub;
3173 }
3174
3175 //------------------------------Montgomery multiplication------------------------
3176 //
3177
3178 #ifndef _WINDOWS
3179
3180 // Subtract 0:b from carry:a. Return carry.
3181 static julong
3182 sub(julong a[], julong b[], julong carry, long len) {
3183 long long i = 0, cnt = len;
3184 julong tmp;
3185 asm volatile("clc; "
3186 "0: ; "
3187 "mov (%[b], %[i], 8), %[tmp]; "
3188 "sbb %[tmp], (%[a], %[i], 8); "
3189 "inc %[i]; dec %[cnt]; "
3190 "jne 0b; "
3191 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3192 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3193 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3194 : "memory");
3195 return tmp;
3196 }
3197
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// T0 is the least significant word of the accumulator and T2 the most
// significant; all three must be 64-bit lvalues.
//
// The final "adc $0" carries an explicit q suffix: T2 uses the "g"
// constraint and may therefore be a memory operand, in which case the
// assembler has no register from which to infer a 64-bit operand size.
#define MACC(A, B, T0, T1, T2)                                          \
do {                                                                    \
  unsigned long macc_hi, macc_lo;                                       \
  __asm__ ("mul %[mcand]; "                                             \
           "add %%rax, %[acc0]; "                                       \
           "adc %%rdx, %[acc1]; "                                       \
           "adcq $0, %[acc2]"                                           \
           : "=&d"(macc_hi), "=a"(macc_lo),                             \
             [acc0]"+r"(T0), [acc1]"+r"(T1), [acc2]"+g"(T2)             \
           : [mcand]"r"(A), "a"(B)                                      \
           : "cc");                                                     \
} while(0)
3207
// Multiply (unsigned) Long A by Long B and add twice the double-length
// result into the accumulator formed of T0, T1, and T2.
//
// T0 is the least significant word of the accumulator and T2 the most
// significant; all three must be 64-bit lvalues.
//
// Both "adc $0" instructions carry an explicit q suffix: T2 uses the "g"
// constraint and may therefore be a memory operand, in which case the
// assembler has no register from which to infer a 64-bit operand size.
#define MACC2(A, B, T0, T1, T2)                                         \
do {                                                                    \
  unsigned long macc_hi, macc_lo;                                       \
  __asm__ ("mul %[mcand]; "                                             \
           "add %%rax, %[acc0]; "                                       \
           "adc %%rdx, %[acc1]; "                                       \
           "adcq $0, %[acc2]; "                                         \
           "add %%rax, %[acc0]; "                                       \
           "adc %%rdx, %[acc1]; "                                       \
           "adcq $0, %[acc2]"                                           \
           : "=&d"(macc_hi), "=a"(macc_lo),                             \
             [acc0]"+r"(T0), [acc1]"+r"(T1), [acc2]"+g"(T2)             \
           : [mcand]"r"(A), "a"(B)                                      \
           : "cc");                                                     \
} while(0)
3218
3219 #else //_WINDOWS
3220
3221 static julong
3222 sub(julong a[], julong b[], julong carry, long len) {
3223 long i;
3224 julong tmp;
3225 unsigned char c = 1;
3226 for (i = 0; i < len; i++) {
3227 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3228 a[i] = tmp;
3229 }
3230 c = _addcarry_u64(c, carry, ~0, &tmp);
3231 return tmp;
3232 }
3233
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// The low product word goes into T0, the high word plus carry into T1,
// and the remaining carry into T2.
#define MACC(A, B, T0, T1, T2)                                          \
do {                                                                    \
  julong macc_hi;                                                       \
  const julong macc_lo = _umul128(A, B, &macc_hi);                      \
  unsigned char macc_cy = _addcarry_u64(0, macc_lo, T0, &T0);           \
  macc_cy = _addcarry_u64(macc_cy, macc_hi, T1, &T1);                   \
  _addcarry_u64(macc_cy, T2, 0, &T2);                                   \
} while(0)
3244
// Multiply (unsigned) Long A by Long B and add twice the double-length
// result into the accumulator formed of T0, T1, and T2.
// The product is computed once and then accumulated in two identical passes.
#define MACC2(A, B, T0, T1, T2)                                         \
do {                                                                    \
  julong macc_hi;                                                       \
  const julong macc_lo = _umul128(A, B, &macc_hi);                      \
  for (int macc_pass = 0; macc_pass < 2; macc_pass++) {                 \
    unsigned char macc_cy = _addcarry_u64(0, macc_lo, T0, &T0);         \
    macc_cy = _addcarry_u64(macc_cy, macc_hi, T1, &T1);                 \
    _addcarry_u64(macc_cy, T2, 0, &T2);                                 \
  }                                                                     \
} while(0)
3258
3259 #endif //_WINDOWS
3260
3261 // Fast Montgomery multiplication. The derivation of the algorithm is
3262 // in A Cryptographic Library for the Motorola DSP56000,
3263 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3264
3265 static void NOINLINE
3266 montgomery_multiply(julong a[], julong b[], julong n[],
3267 julong m[], julong inv, int len) {
3268 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3269 int i;
3270
3271 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3272
3273 for (i = 0; i < len; i++) {
3274 int j;
3275 for (j = 0; j < i; j++) {
3276 MACC(a[j], b[i-j], t0, t1, t2);
3277 MACC(m[j], n[i-j], t0, t1, t2);
3278 }
3279 MACC(a[i], b[0], t0, t1, t2);
3280 m[i] = t0 * inv;
3281 MACC(m[i], n[0], t0, t1, t2);
3282
3283 assert(t0 == 0, "broken Montgomery multiply");
3284
3285 t0 = t1; t1 = t2; t2 = 0;
3286 }
3287
3288 for (i = len; i < 2*len; i++) {
3289 int j;
3290 for (j = i-len+1; j < len; j++) {
3291 MACC(a[j], b[i-j], t0, t1, t2);
3292 MACC(m[j], n[i-j], t0, t1, t2);
3293 }
3294 m[i-len] = t0;
3295 t0 = t1; t1 = t2; t2 = 0;
3296 }
3297
3298 while (t0)
3299 t0 = sub(m, n, t0, len);
3300 }
3301
3302 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3303 // multiplies so it should be up to 25% faster than Montgomery
3304 // multiplication. However, its loop control is more complex and it
3305 // may actually run slower on some machines.
3306
3307 static void NOINLINE
3308 montgomery_square(julong a[], julong n[],
3309 julong m[], julong inv, int len) {
3310 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3311 int i;
3312
3313 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3314
3315 for (i = 0; i < len; i++) {
3316 int j;
3317 int end = (i+1)/2;
3318 for (j = 0; j < end; j++) {
3319 MACC2(a[j], a[i-j], t0, t1, t2);
3320 MACC(m[j], n[i-j], t0, t1, t2);
3321 }
3322 if ((i & 1) == 0) {
3323 MACC(a[j], a[j], t0, t1, t2);
3324 }
3325 for (; j < i; j++) {
3326 MACC(m[j], n[i-j], t0, t1, t2);
3327 }
3328 m[i] = t0 * inv;
3329 MACC(m[i], n[0], t0, t1, t2);
3330
3331 assert(t0 == 0, "broken Montgomery square");
3332
3333 t0 = t1; t1 = t2; t2 = 0;
3334 }
3335
3336 for (i = len; i < 2*len; i++) {
3337 int start = i-len+1;
3338 int end = start + (len - start)/2;
3339 int j;
3340 for (j = start; j < end; j++) {
3341 MACC2(a[j], a[i-j], t0, t1, t2);
3342 MACC(m[j], n[i-j], t0, t1, t2);
3343 }
3344 if ((i & 1) == 0) {
3345 MACC(a[j], a[j], t0, t1, t2);
3346 }
3347 for (; j < len; j++) {
3348 MACC(m[j], n[i-j], t0, t1, t2);
3349 }
3350 m[i-len] = t0;
3351 t0 = t1; t1 = t2; t2 = 0;
3352 }
3353
3354 while (t0)
3355 t0 = sub(m, n, t0, len);
3356 }
3357
3358 // Swap words in a longword.
3359 static julong swap(julong x) {
3360 return (x << 32) | (x >> 32);
3361 }
3362
3363 // Copy len longwords from s to d, word-swapping as we go. The
3364 // destination array is reversed.
3365 static void reverse_words(julong *s, julong *d, int len) {
3366 d += len;
3367 while(len-- > 0) {
3368 d--;
3369 *d = swap(*s);
3370 s++;
3371 }
3372 }
3373
// Array length (in jints) from which SharedRuntime::montgomery_square
// uses the dedicated squaring routine rather than a plain Montgomery
// multiply of the operand by itself.  The value at which squaring
// becomes advantageous was determined experimentally on an i7-3930K
// (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3377
3378 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3379 jint len, jlong inv,
3380 jint *m_ints) {
3381 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3382 int longwords = len/2;
3383
3384 // Make very sure we don't use so much space that the stack might
3385 // overflow. 512 jints corresponds to an 16384-bit integer and
3386 // will use here a total of 8k bytes of stack space.
3387 int divisor = sizeof(julong) * 4;
3388 guarantee(longwords <= 8192 / divisor, "must be");
3389 int total_allocation = longwords * sizeof (julong) * 4;
3390 julong *scratch = (julong *)alloca(total_allocation);
3391
3392 // Local scratch arrays
3393 julong
3394 *a = scratch + 0 * longwords,
3395 *b = scratch + 1 * longwords,
3396 *n = scratch + 2 * longwords,
3397 *m = scratch + 3 * longwords;
3398
3399 reverse_words((julong *)a_ints, a, longwords);
3400 reverse_words((julong *)b_ints, b, longwords);
3401 reverse_words((julong *)n_ints, n, longwords);
3402
3403 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3404
3405 reverse_words(m, (julong *)m_ints, longwords);
3406 }
3407
3408 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3409 jint len, jlong inv,
3410 jint *m_ints) {
3411 assert(len % 2 == 0, "array length in montgomery_square must be even");
3412 int longwords = len/2;
3413
3414 // Make very sure we don't use so much space that the stack might
3415 // overflow. 512 jints corresponds to an 16384-bit integer and
3416 // will use here a total of 6k bytes of stack space.
3417 int divisor = sizeof(julong) * 3;
3418 guarantee(longwords <= (8192 / divisor), "must be");
3419 int total_allocation = longwords * sizeof (julong) * 3;
3420 julong *scratch = (julong *)alloca(total_allocation);
3421
3422 // Local scratch arrays
3423 julong
3424 *a = scratch + 0 * longwords,
3425 *n = scratch + 1 * longwords,
3426 *m = scratch + 2 * longwords;
3427
3428 reverse_words((julong *)a_ints, a, longwords);
3429 reverse_words((julong *)n_ints, n, longwords);
3430
3431 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3432 ::montgomery_square(a, n, m, (julong)inv, longwords);
3433 } else {
3434 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3435 }
3436
3437 reverse_words(m, (julong *)m_ints, longwords);
3438 }
3439
3440 #if INCLUDE_JFR
3441
3442 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3443 // It returns a jobject handle to the event writer.
3444 // The handle is dereferenced and the return value is the event writer oop.
3445 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3446 enum layout {
3447 rbp_off,
3448 rbpH_off,
3449 return_off,
3450 return_off2,
3451 framesize // inclusive of return address
3452 };
3453
3454 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3455 CodeBuffer code(name, 1024, 64);
3456 MacroAssembler* masm = new MacroAssembler(&code);
3457 address start = __ pc();
3458
3459 __ enter();
3460 address the_pc = __ pc();
3461
3462 int frame_complete = the_pc - start;
3463
3464 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3465 __ movptr(c_rarg0, r15_thread);
3466 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3467 __ reset_last_Java_frame(true);
3468
3469 // rax is jobject handle result, unpack and process it through a barrier.
3470 __ resolve_global_jobject(rax, c_rarg0);
3471
3472 __ leave();
3473 __ ret(0);
3474
3475 OopMapSet* oop_maps = new OopMapSet();
3476 OopMap* map = new OopMap(framesize, 1);
3477 oop_maps->add_gc_map(frame_complete, map);
3478
3479 RuntimeStub* stub =
3480 RuntimeStub::new_runtime_stub(name,
3481 &code,
3482 frame_complete,
3483 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3484 oop_maps,
3485 false);
3486 return stub;
3487 }
3488
3489 // For c2: call to return a leased buffer.
3490 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3491 enum layout {
3492 rbp_off,
3493 rbpH_off,
3494 return_off,
3495 return_off2,
3496 framesize // inclusive of return address
3497 };
3498
3499 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3500 CodeBuffer code(name, 1024, 64);
3501 MacroAssembler* masm = new MacroAssembler(&code);
3502 address start = __ pc();
3503
3504 __ enter();
3505 address the_pc = __ pc();
3506
3507 int frame_complete = the_pc - start;
3508
3509 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3510 __ movptr(c_rarg0, r15_thread);
3511 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3512 __ reset_last_Java_frame(true);
3513
3514 __ leave();
3515 __ ret(0);
3516
3517 OopMapSet* oop_maps = new OopMapSet();
3518 OopMap* map = new OopMap(framesize, 1);
3519 oop_maps->add_gc_map(frame_complete, map);
3520
3521 RuntimeStub* stub =
3522 RuntimeStub::new_runtime_stub(name,
3523 &code,
3524 frame_complete,
3525 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3526 oop_maps,
3527 false);
3528 return stub;
3529 }
3530
3531 #endif // INCLUDE_JFR