/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/timerTrace.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
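
  // Rough sketch of the frame built below (high addresses first), for orientation
  // only; the authoritative offsets are in the layout enum above:
  //   [ return address ]   <- pushed by the caller
  //   [ saved rbp      ]   <- enter()
  //   [ flags          ]   <- pushf()
  //   [ 8-byte pad     ]   <- subq(rsp, 8) to keep rsp 16-byte aligned
  //   [ legacy GPRs    ]   <- save_legacy_gprs()
  //   [ FPU/XSAVE area ]   <- push_FPU_state(), plus the YMM/ZMM/opmask/EGPR
  //                           regions filled in explicitly further down
  //   [ arg reg save   ]   <- frame::arg_reg_save_area_bytes, if non-zero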

  __ enter();     // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // The CPU state push above already handles this on EVEX-enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
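
  // Note (illustrative): each set_callee_saved() call below records, in
  // VMRegImpl stack-slot units, where a given register was saved in this
  // frame (e.g. the slot at rax_off holds rax), so that deoptimization and
  // GC can later locate and update those saved values.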

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
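  //
  // Illustrative example (hypothetical signature, not taken from the code below):
  // for (int, long, double, Object) the loop below maps the int to INT_ArgReg[0],
  // the long to INT_ArgReg[1] (its trailing T_VOID half gets set_bad()), the
  // double to FP_ArgReg[0] (again with a T_VOID half), and the Object to
  // INT_ArgReg[2]; anything beyond the register counts falls back to stack
  // slots aligned to pairs via align_up(stk_args, 2).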
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee.
  // rax isn't live so capture the return address while we easily can.
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //     i   st_off
    //     0   32 T_LONG
    //     1   24 T_VOID
    //     2   16 T_OBJECT
    //     3    8 T_BOOL
    //     -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code, and register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
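  // Worked example (illustrative): with comp_args_on_stack == 5, the five
  // 4-byte slots occupy 20 bytes, which align_up rounds to 24, i.e. 3 words
  // of outgoing space to reserve before re-aligning rsp to 16 bytes below.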
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address and misalign the stack so that the youngest frame
  // sees the same layout it would see right after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
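  //
  // Note (overview, for orientation): the code below exposes several entry
  // points in emission order: i2c_entry, then c2i_unverified_entry (inline
  // cache check), then c2i_entry (with an optional class-init barrier), with
  // c2i_no_clinit_check_entry placed just past that barrier; all of them are
  // registered via AdapterHandlerLibrary::new_entry at the end.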

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);

  if (CheckJNICalls) {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // If the held monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
    __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // rax may hold an exception oop, save it before the call
    __ push(rax);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
    __ pop(rax);

    // For vthreads we have to explicitly zero the JNI monitor count of the carrier
    // on termination. The held count is implicitly zeroed below when we restore from
    // the parent held count (which has to be zero).
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#ifdef ASSERT
  else {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // See comment just above. If not checking JNI calls the JNI count is only
    // needed for assertion checking.
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#endif

  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1498 Register reg_cont_obj = c_rarg1; 1499 Register reg_is_cont = c_rarg2; 1500 Register reg_is_virtual = c_rarg3; 1501 1502 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1503 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1504 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1505 1506 // Utility methods kill rax, make sure there are no collisions 1507 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1508 1509 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1510 relocInfo::static_call_type); 1511 1512 address start = __ pc(); 1513 1514 Label L_thaw, L_exit; 1515 1516 // i2i entry used at interp_only_mode only 1517 interpreted_entry_offset = __ pc() - start; 1518 { 1519 #ifdef ASSERT 1520 Label is_interp_only; 1521 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1522 __ jcc(Assembler::notEqual, is_interp_only); 1523 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1524 __ bind(is_interp_only); 1525 #endif 1526 1527 __ pop(rax); // return address 1528 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1529 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1530 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1531 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1532 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1533 __ push(rax); // return address 1534 __ push_cont_fastpath(); 1535 1536 __ enter(); 1537 1538 stack_slots = 2; // will be adjusted in setup 1539 OopMap* map = continuation_enter_setup(masm, stack_slots); 1540 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1541 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1542 1543 __ verify_oop(reg_cont_obj); 1544 1545 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1546 1547 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1548 __ testptr(reg_is_cont, reg_is_cont); 1549 __ jcc(Assembler::notZero, L_thaw); 1550 1551 // --- Resolve path 1552 1553 // Make sure the call is patchable 1554 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1555 // Emit stub for static call 1556 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1557 if (stub == nullptr) { 1558 fatal("CodeCache is full at gen_continuation_enter"); 1559 } 1560 __ call(resolve); 1561 oop_maps->add_gc_map(__ pc() - start, map); 1562 __ post_call_nop(); 1563 1564 __ jmp(L_exit); 1565 } 1566 1567 // compiled entry 1568 __ align(CodeEntryAlignment); 1569 compiled_entry_offset = __ pc() - start; 1570 __ enter(); 1571 1572 stack_slots = 2; // will be adjusted in setup 1573 OopMap* map = continuation_enter_setup(masm, stack_slots); 1574 1575 // Frame is now completed as far as size and linkage. 1576 frame_complete = __ pc() - start; 1577 1578 __ verify_oop(reg_cont_obj); 1579 1580 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1581 1582 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1583 __ testptr(reg_is_cont, reg_is_cont); 1584 __ jccb(Assembler::notZero, L_thaw); 1585 1586 // --- call Continuation.enter(Continuation c, boolean isContinue) 1587 1588 // Make sure the call is patchable 1589 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1590 1591 // Emit stub for static call 1592 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1593 if (stub == nullptr) { 1594 fatal("CodeCache is full at gen_continuation_enter"); 1595 } 1596 1597 // The call needs to be resolved. There's a special case for this in 1598 // SharedRuntime::find_callee_info_helper() which calls 1599 // LinkResolver::resolve_continuation_enter() which resolves the call to 1600 // Continuation.enter(Continuation c, boolean isContinue). 1601 __ call(resolve); 1602 1603 oop_maps->add_gc_map(__ pc() - start, map); 1604 __ post_call_nop(); 1605 1606 __ jmpb(L_exit); 1607 1608 // --- Thawing path 1609 1610 __ bind(L_thaw); 1611 1612 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1613 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1614 1615 ContinuationEntry::_return_pc_offset = __ pc() - start; 1616 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1617 __ post_call_nop(); 1618 1619 // --- Normal exit (resolve/thawing) 1620 1621 __ bind(L_exit); 1622 ContinuationEntry::_cleanup_offset = __ pc() - start; 1623 continuation_enter_cleanup(masm); 1624 __ pop(rbp); 1625 __ ret(0); 1626 1627 // --- Exception handling path 1628 1629 exception_offset = __ pc() - start; 1630 1631 continuation_enter_cleanup(masm); 1632 __ pop(rbp); 1633 1634 __ movptr(c_rarg0, r15_thread); 1635 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1636 1637 // rax still holds the original exception oop, save it before the call 1638 __ push(rax); 1639 1640 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1641 __ movptr(rbx, rax); 1642 1643 // Continue at exception handler: 1644 // rax: exception oop 1645 // rbx: exception handler 1646 // rdx: exception pc 1647 __ pop(rax); 1648 __ verify_oop(rax); 1649 __ pop(rdx); 1650 __ jmp(rbx); 1651 } 1652 1653 static void gen_continuation_yield(MacroAssembler* masm, 1654 const VMRegPair* regs, 1655 OopMapSet* oop_maps, 1656 int& frame_complete, 1657 int& stack_slots, 1658 int& compiled_entry_offset) { 1659 enum layout { 1660 rbp_off, 1661 rbpH_off, 1662 return_off, 1663 return_off2, 1664 framesize // inclusive of return address 1665 }; 1666 stack_slots = framesize / VMRegImpl::slots_per_word; 1667 assert(stack_slots == 2, "recheck layout"); 1668 1669 address start = __ pc(); 1670 compiled_entry_offset = __ pc() - start; 1671 __ enter(); 1672 address the_pc = __ pc(); 1673 1674 frame_complete = the_pc - start; 1675 1676 // This nop must be exactly at the PC we push into the frame info. 1677 // We use this nop for fast CodeBlob lookup, associate the OopMap 1678 // with it right away. 
1679 __ post_call_nop(); 1680 OopMap* map = new OopMap(framesize, 1); 1681 oop_maps->add_gc_map(frame_complete, map); 1682 1683 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1684 __ movptr(c_rarg0, r15_thread); 1685 __ movptr(c_rarg1, rsp); 1686 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1687 __ reset_last_Java_frame(true); 1688 1689 Label L_pinned; 1690 1691 __ testptr(rax, rax); 1692 __ jcc(Assembler::notZero, L_pinned); 1693 1694 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1695 continuation_enter_cleanup(masm); 1696 __ pop(rbp); 1697 __ ret(0); 1698 1699 __ bind(L_pinned); 1700 1701 // Pinned, return to caller 1702 1703 // handle pending exception thrown by freeze 1704 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1705 Label ok; 1706 __ jcc(Assembler::equal, ok); 1707 __ leave(); 1708 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1709 __ bind(ok); 1710 1711 __ leave(); 1712 __ ret(0); 1713 } 1714 1715 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1716 ::continuation_enter_cleanup(masm); 1717 } 1718 1719 static void gen_special_dispatch(MacroAssembler* masm, 1720 const methodHandle& method, 1721 const BasicType* sig_bt, 1722 const VMRegPair* regs) { 1723 verify_oop_args(masm, method, sig_bt, regs); 1724 vmIntrinsics::ID iid = method->intrinsic_id(); 1725 1726 // Now write the args into the outgoing interpreter space 1727 bool has_receiver = false; 1728 Register receiver_reg = noreg; 1729 int member_arg_pos = -1; 1730 Register member_reg = noreg; 1731 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1732 if (ref_kind != 0) { 1733 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1734 member_reg = rbx; // known to be free at this point 1735 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1736 } else if (iid == vmIntrinsics::_invokeBasic) { 1737 has_receiver = true; 1738 } else if (iid == vmIntrinsics::_linkToNative) { 1739 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1740 member_reg = rbx; // known to be free at this point 1741 } else { 1742 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1743 } 1744 1745 if (member_reg != noreg) { 1746 // Load the member_arg into register, if necessary. 1747 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1748 VMReg r = regs[member_arg_pos].first(); 1749 if (r->is_stack()) { 1750 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1751 } else { 1752 // no data motion is needed 1753 member_reg = r->as_Register(); 1754 } 1755 } 1756 1757 if (has_receiver) { 1758 // Make sure the receiver is loaded into a register. 1759 assert(method->size_of_parameters() > 0, "oob"); 1760 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1761 VMReg r = regs[0].first(); 1762 assert(r->is_valid(), "bad receiver arg"); 1763 if (r->is_stack()) { 1764 // Porting note: This assumes that compiled calling conventions always 1765 // pass the receiver oop in a register. If this is not true on some 1766 // platform, pick a temp and load the receiver from stack. 
1767 fatal("receiver always in a register"); 1768 receiver_reg = j_rarg0; // known to be free at this point 1769 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1770 } else { 1771 // no data motion is needed 1772 receiver_reg = r->as_Register(); 1773 } 1774 } 1775 1776 // Figure out which address we are really jumping to: 1777 MethodHandles::generate_method_handle_dispatch(masm, iid, 1778 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1779 } 1780 1781 // --------------------------------------------------------------------------- 1782 // Generate a native wrapper for a given method. The method takes arguments 1783 // in the Java compiled code convention, marshals them to the native 1784 // convention (handlizes oops, etc), transitions to native, makes the call, 1785 // returns to java state (possibly blocking), unhandlizes any result and 1786 // returns. 1787 // 1788 // Critical native functions are a shorthand for the use of 1789 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1790 // functions. The wrapper is expected to unpack the arguments before 1791 // passing them to the callee. Critical native functions leave the state _in_Java, 1792 // since they cannot stop for GC. 1793 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1794 // block and the check for pending exceptions it's impossible for them 1795 // to be thrown. 1796 // 1797 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1798 const methodHandle& method, 1799 int compile_id, 1800 BasicType* in_sig_bt, 1801 VMRegPair* in_regs, 1802 BasicType ret_type) { 1803 if (method->is_continuation_native_intrinsic()) { 1804 int exception_offset = -1; 1805 OopMapSet* oop_maps = new OopMapSet(); 1806 int frame_complete = -1; 1807 int stack_slots = -1; 1808 int interpreted_entry_offset = -1; 1809 int vep_offset = -1; 1810 if (method->is_continuation_enter_intrinsic()) { 1811 gen_continuation_enter(masm, 1812 in_regs, 1813 exception_offset, 1814 oop_maps, 1815 frame_complete, 1816 stack_slots, 1817 interpreted_entry_offset, 1818 vep_offset); 1819 } else if (method->is_continuation_yield_intrinsic()) { 1820 gen_continuation_yield(masm, 1821 in_regs, 1822 oop_maps, 1823 frame_complete, 1824 stack_slots, 1825 vep_offset); 1826 } else { 1827 guarantee(false, "Unknown Continuation native intrinsic"); 1828 } 1829 1830 #ifdef ASSERT 1831 if (method->is_continuation_enter_intrinsic()) { 1832 assert(interpreted_entry_offset != -1, "Must be set"); 1833 assert(exception_offset != -1, "Must be set"); 1834 } else { 1835 assert(interpreted_entry_offset == -1, "Must be unset"); 1836 assert(exception_offset == -1, "Must be unset"); 1837 } 1838 assert(frame_complete != -1, "Must be set"); 1839 assert(stack_slots != -1, "Must be set"); 1840 assert(vep_offset != -1, "Must be set"); 1841 #endif 1842 1843 __ flush(); 1844 nmethod* nm = nmethod::new_native_nmethod(method, 1845 compile_id, 1846 masm->code(), 1847 vep_offset, 1848 frame_complete, 1849 stack_slots, 1850 in_ByteSize(-1), 1851 in_ByteSize(-1), 1852 oop_maps, 1853 exception_offset); 1854 if (nm == nullptr) return nm; 1855 if (method->is_continuation_enter_intrinsic()) { 1856 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1857 } else if (method->is_continuation_yield_intrinsic()) { 1858 _cont_doYield_stub = nm; 1859 } 1860 return nm; 1861 } 1862 1863 if (method->is_method_handle_intrinsic()) { 1864 vmIntrinsics::ID iid = method->intrinsic_id(); 1865 intptr_t 
start = (intptr_t)__ pc();
1866 int vep_offset = ((intptr_t)__ pc()) - start;
1867 gen_special_dispatch(masm,
1868 method,
1869 in_sig_bt,
1870 in_regs);
1871 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1872 __ flush();
1873 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1874 return nmethod::new_native_nmethod(method,
1875 compile_id,
1876 masm->code(),
1877 vep_offset,
1878 frame_complete,
1879 stack_slots / VMRegImpl::slots_per_word,
1880 in_ByteSize(-1),
1881 in_ByteSize(-1),
1882 nullptr);
1883 }
1884 address native_func = method->native_function();
1885 assert(native_func != nullptr, "must have function");
1886
1887 // An OopMap for lock (and class if static)
1888 OopMapSet *oop_maps = new OopMapSet();
1889 intptr_t start = (intptr_t)__ pc();
1890
1891 // We have received a description of where all the java args are located
1892 // on entry to the wrapper. We need to convert these args to where
1893 // the jni function will expect them. To figure out where they go
1894 // we convert the java signature to a C signature by inserting
1895 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1896
1897 const int total_in_args = method->size_of_parameters();
1898 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1899
1900 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1901 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1902
1903 int argc = 0;
1904 out_sig_bt[argc++] = T_ADDRESS;
1905 if (method->is_static()) {
1906 out_sig_bt[argc++] = T_OBJECT;
1907 }
1908
1909 for (int i = 0; i < total_in_args ; i++ ) {
1910 out_sig_bt[argc++] = in_sig_bt[i];
1911 }
1912
1913 // Now figure out where the args must be stored and how much stack space
1914 // they require.
1915 int out_arg_slots;
1916 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1917
1918 // Compute framesize for the wrapper. We need to handlize all oops in
1919 // incoming registers
1920
1921 // Calculate the total number of stack slots we will need.
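// As a rough overview of the accounting done below (in VMReg stack slots; the
// concrete numbers depend on the wrapped method, so treat this as an
// illustrative sketch rather than the authoritative computation):
//
//   stack_slots  = out_preserve_stack_slots() + out_arg_slots;   // ABI area + outgoing C args
//   stack_slots += 6 * VMRegImpl::slots_per_word;                // inbound oop handle area
//   if (static)       stack_slots += VMRegImpl::slots_per_word;  // handlized klass mirror
//   if (synchronized) stack_slots += VMRegImpl::slots_per_word;  // BasicLock box
//   stack_slots += 6;                       // 2 temp slots + 4 slots for return address and saved rbp
//   stack_slots  = align_up(stack_slots, StackAlignmentInSlots);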
1922 1923 // First count the abi requirement plus all of the outgoing args 1924 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1925 1926 // Now the space for the inbound oop handle area 1927 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1928 1929 int oop_handle_offset = stack_slots; 1930 stack_slots += total_save_slots; 1931 1932 // Now any space we need for handlizing a klass if static method 1933 1934 int klass_slot_offset = 0; 1935 int klass_offset = -1; 1936 int lock_slot_offset = 0; 1937 bool is_static = false; 1938 1939 if (method->is_static()) { 1940 klass_slot_offset = stack_slots; 1941 stack_slots += VMRegImpl::slots_per_word; 1942 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1943 is_static = true; 1944 } 1945 1946 // Plus a lock if needed 1947 1948 if (method->is_synchronized()) { 1949 lock_slot_offset = stack_slots; 1950 stack_slots += VMRegImpl::slots_per_word; 1951 } 1952 1953 // Now a place (+2) to save return values or temp during shuffling 1954 // + 4 for return address (which we own) and saved rbp 1955 stack_slots += 6; 1956 1957 // Ok The space we have allocated will look like: 1958 // 1959 // 1960 // FP-> | | 1961 // |---------------------| 1962 // | 2 slots for moves | 1963 // |---------------------| 1964 // | lock box (if sync) | 1965 // |---------------------| <- lock_slot_offset 1966 // | klass (if static) | 1967 // |---------------------| <- klass_slot_offset 1968 // | oopHandle area | 1969 // |---------------------| <- oop_handle_offset (6 java arg registers) 1970 // | outbound memory | 1971 // | based arguments | 1972 // | | 1973 // |---------------------| 1974 // | | 1975 // SP-> | out_preserved_slots | 1976 // 1977 // 1978 1979 1980 // Now compute actual number of stack words we need rounding to make 1981 // stack properly aligned. 1982 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1983 1984 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1985 1986 // First thing make an ic check to see if we should even be here 1987 1988 // We are free to use all registers as temps without saving them and 1989 // restoring them except rbp. rbp is the only callee save register 1990 // as far as the interpreter and the compiler(s) are concerned. 1991 1992 const Register receiver = j_rarg0; 1993 1994 Label exception_pending; 1995 1996 assert_different_registers(receiver, rscratch1, rscratch2); 1997 __ verify_oop(receiver); 1998 __ ic_check(8 /* end_alignment */); 1999 2000 int vep_offset = ((intptr_t)__ pc()) - start; 2001 2002 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2003 Label L_skip_barrier; 2004 Register klass = r10; 2005 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2006 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2007 2008 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2009 2010 __ bind(L_skip_barrier); 2011 } 2012 2013 #ifdef COMPILER1 2014 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
2015 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2016 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2017 }
2018 #endif // COMPILER1
2019
2020 // The instruction at the verified entry point must be 5 bytes or longer
2021 // because it can be patched on the fly by make_non_entrant. The stack bang
2022 // instruction fits that requirement.
2023
2024 // Generate stack overflow check
2025 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2026
2027 // Generate a new frame for the wrapper.
2028 __ enter();
2029 // -2 because return address is already present and so is saved rbp
2030 __ subptr(rsp, stack_size - 2*wordSize);
2031
2032 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2033 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2034 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2035
2036 // Frame is now completed as far as size and linkage.
2037 int frame_complete = ((intptr_t)__ pc()) - start;
2038
2039 #ifdef ASSERT
2040 __ check_stack_alignment(rsp, "improperly aligned stack");
2041 #endif /* ASSERT */
2042
2043
2044 // We use r14 as the oop handle for the receiver/klass
2045 // It is callee save so it survives the call to native
2046
2047 const Register oop_handle_reg = r14;
2048
2049 //
2050 // We immediately shuffle the arguments so that for any vm call we have to
2051 // make from here on out (sync slow path, jvmti, etc.) we will have
2052 // captured the oops from our caller and have a valid oopMap for
2053 // them.
2054
2055 // -----------------
2056 // The Grand Shuffle
2057
2058 // The Java calling convention is either equal (linux) or denser (win64) than the
2059 // c calling convention. However, because of the jni_env argument, the c calling
2060 // convention always has at least one more (and two for static) arguments than Java.
2061 // Therefore if we move the args from java -> c backwards then we will never have
2062 // a register->register conflict and we don't have to build a dependency graph
2063 // and figure out how to break any cycles.
2064 //
2065
2066 // Record esp-based slot for receiver on stack for non-static methods
2067 int receiver_offset = -1;
2068
2069 // This is a trick. We double the stack slots so we can claim
2070 // the oops in the caller's frame. Since we are sure to have
2071 // more args than the caller, doubling is enough to make
2072 // sure we can capture all the incoming oop args from the
2073 // caller.
2074 //
2075 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2076
2077 // Mark location of rbp (someday)
2078 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2079
2080 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2081 // All inbound args are referenced based on rbp and all outbound args via rsp.
2082
2083
2084 #ifdef ASSERT
2085 bool reg_destroyed[Register::number_of_registers];
2086 bool freg_destroyed[XMMRegister::number_of_registers];
2087 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2088 reg_destroyed[r] = false;
2089 }
2090 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2091 freg_destroyed[f] = false;
2092 }
2093
2094 #endif /* ASSERT */
2095
2096 // For JNI natives the incoming and outgoing registers are offset upwards.
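// A small worked example of the ordering built below (illustrative only): for
// a non-static method with 3 Java arguments, total_in_args == 3 and
// total_c_args == 4 (the JNIEnv* is prepended), so arg_order receives the
// pairs (2,3), (1,2), (0,1) and the move loop walks the Java arguments from
// last to first, moving java arg i into C arg i+1 each time.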
2097 GrowableArray<int> arg_order(2 * total_in_args); 2098 2099 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2100 arg_order.push(i); 2101 arg_order.push(c_arg); 2102 } 2103 2104 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2105 int i = arg_order.at(ai); 2106 int c_arg = arg_order.at(ai + 1); 2107 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2108 #ifdef ASSERT 2109 if (in_regs[i].first()->is_Register()) { 2110 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2111 } else if (in_regs[i].first()->is_XMMRegister()) { 2112 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2113 } 2114 if (out_regs[c_arg].first()->is_Register()) { 2115 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2116 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2117 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2118 } 2119 #endif /* ASSERT */ 2120 switch (in_sig_bt[i]) { 2121 case T_ARRAY: 2122 case T_OBJECT: 2123 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2124 ((i == 0) && (!is_static)), 2125 &receiver_offset); 2126 break; 2127 case T_VOID: 2128 break; 2129 2130 case T_FLOAT: 2131 __ float_move(in_regs[i], out_regs[c_arg]); 2132 break; 2133 2134 case T_DOUBLE: 2135 assert( i + 1 < total_in_args && 2136 in_sig_bt[i + 1] == T_VOID && 2137 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2138 __ double_move(in_regs[i], out_regs[c_arg]); 2139 break; 2140 2141 case T_LONG : 2142 __ long_move(in_regs[i], out_regs[c_arg]); 2143 break; 2144 2145 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2146 2147 default: 2148 __ move32_64(in_regs[i], out_regs[c_arg]); 2149 } 2150 } 2151 2152 int c_arg; 2153 2154 // Pre-load a static method's oop into r14. Used both by locking code and 2155 // the normal JNI call code. 2156 // point c_arg at the first arg that is already loaded in case we 2157 // need to spill before we call out 2158 c_arg = total_c_args - total_in_args; 2159 2160 if (method->is_static()) { 2161 2162 // load oop into a register 2163 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2164 2165 // Now handlize the static class mirror it's known not-null. 2166 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2167 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2168 2169 // Now get the handle 2170 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2171 // store the klass handle as second argument 2172 __ movptr(c_rarg1, oop_handle_reg); 2173 // and protect the arg if we must spill 2174 c_arg--; 2175 } 2176 2177 // Change state to native (we save the return address in the thread, since it might not 2178 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2179 // points into the right code segment. It does not have to be the correct return pc. 2180 // We use the same pc/oopMap repeatedly when we call out 2181 2182 Label native_return; 2183 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2184 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2185 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2186 } else { 2187 intptr_t the_pc = (intptr_t) __ pc(); 2188 oop_maps->add_gc_map(the_pc - start, map); 2189 2190 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2191 } 2192 2193 // We have all of the arguments setup at this point. 
We must not touch any register 2194 // argument registers at this point (what if we save/restore them there are no oop? 2195 2196 if (DTraceMethodProbes) { 2197 // protect the args we've loaded 2198 save_args(masm, total_c_args, c_arg, out_regs); 2199 __ mov_metadata(c_rarg1, method()); 2200 __ call_VM_leaf( 2201 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2202 r15_thread, c_rarg1); 2203 restore_args(masm, total_c_args, c_arg, out_regs); 2204 } 2205 2206 // RedefineClasses() tracing support for obsolete method entry 2207 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2208 // protect the args we've loaded 2209 save_args(masm, total_c_args, c_arg, out_regs); 2210 __ mov_metadata(c_rarg1, method()); 2211 __ call_VM_leaf( 2212 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2213 r15_thread, c_rarg1); 2214 restore_args(masm, total_c_args, c_arg, out_regs); 2215 } 2216 2217 // Lock a synchronized method 2218 2219 // Register definitions used by locking and unlocking 2220 2221 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2222 const Register obj_reg = rbx; // Will contain the oop 2223 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2224 const Register old_hdr = r13; // value of old header at unlock time 2225 2226 Label slow_path_lock; 2227 Label lock_done; 2228 2229 if (method->is_synchronized()) { 2230 Label count_mon; 2231 2232 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2233 2234 // Get the handle (the 2nd argument) 2235 __ mov(oop_handle_reg, c_rarg1); 2236 2237 // Get address of the box 2238 2239 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2240 2241 // Load the oop from the handle 2242 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2243 2244 if (LockingMode == LM_MONITOR) { 2245 __ jmp(slow_path_lock); 2246 } else if (LockingMode == LM_LEGACY) { 2247 // Load immediate 1 into swap_reg %rax 2248 __ movl(swap_reg, 1); 2249 2250 // Load (object->mark() | 1) into swap_reg %rax 2251 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2252 2253 // Save (object->mark() | 1) into BasicLock's displaced header 2254 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2255 2256 // src -> dest iff dest == rax else rax <- dest 2257 __ lock(); 2258 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2259 __ jcc(Assembler::equal, count_mon); 2260 2261 // Hmm should this move to the slow path code area??? 2262 2263 // Test if the oopMark is an obvious stack pointer, i.e., 2264 // 1) (mark & 3) == 0, and 2265 // 2) rsp <= mark < mark + os::pagesize() 2266 // These 3 tests can be done by evaluating the following 2267 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2268 // assuming both stack pointer and pagesize have their 2269 // least significant 2 bits clear. 
2270 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2271
2272 __ subptr(swap_reg, rsp);
2273 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2274
2275 // Save the test result, for recursive case, the result is zero
2276 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2277 __ jcc(Assembler::notEqual, slow_path_lock);
2278
2279 __ bind(count_mon);
2280 __ inc_held_monitor_count();
2281 } else {
2282 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2283 __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2284 }
2285
2286 // Slow path will re-enter here
2287 __ bind(lock_done);
2288 }
2289
2290 // Finally just about ready to make the JNI call
2291
2292 // get JNIEnv* which is first argument to native
2293 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2294
2295 // Now set thread in native
2296 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2297
2298 __ call(RuntimeAddress(native_func));
2299
2300 // Verify or restore cpu control state after JNI call
2301 __ restore_cpu_control_state_after_jni(rscratch1);
2302
2303 // Unpack native results.
2304 switch (ret_type) {
2305 case T_BOOLEAN: __ c2bool(rax); break;
2306 case T_CHAR : __ movzwl(rax, rax); break;
2307 case T_BYTE : __ sign_extend_byte (rax); break;
2308 case T_SHORT : __ sign_extend_short(rax); break;
2309 case T_INT : /* nothing to do */ break;
2310 case T_DOUBLE :
2311 case T_FLOAT :
2312 // Result is in xmm0; we'll save as needed
2313 break;
2314 case T_ARRAY: // Really a handle
2315 case T_OBJECT: // Really a handle
2316 break; // can't de-handlize until after safepoint check
2317 case T_VOID: break;
2318 case T_LONG: break;
2319 default : ShouldNotReachHere();
2320 }
2321
2322 // Switch thread to "native transition" state before reading the synchronization state.
2323 // This additional state is necessary because reading and testing the synchronization
2324 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2325 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2326 // VM thread changes sync state to synchronizing and suspends threads for GC.
2327 // Thread A is resumed to finish this native method, but doesn't block here since it
2328 // didn't see any synchronization in progress, and escapes.
2329 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2330
2331 // Force this write out before the read below
2332 if (!UseSystemMemoryBarrier) {
2333 __ membar(Assembler::Membar_mask_bits(
2334 Assembler::LoadLoad | Assembler::LoadStore |
2335 Assembler::StoreLoad | Assembler::StoreStore));
2336 }
2337
2338 // check for safepoint operation in progress and/or pending suspend requests
2339 {
2340 Label Continue;
2341 Label slow_path;
2342
2343 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2344
2345 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2346 __ jcc(Assembler::equal, Continue);
2347 __ bind(slow_path);
2348
2349 // Don't use call_VM as it will see a possible pending exception and forward it
2350 // and never return here, preventing us from clearing _last_native_pc down below.
2351 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2352 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2353 // by hand.
2354 // 2355 __ vzeroupper(); 2356 save_native_result(masm, ret_type, stack_slots); 2357 __ mov(c_rarg0, r15_thread); 2358 __ mov(r12, rsp); // remember sp 2359 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2360 __ andptr(rsp, -16); // align stack as required by ABI 2361 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2362 __ mov(rsp, r12); // restore sp 2363 __ reinit_heapbase(); 2364 // Restore any method result value 2365 restore_native_result(masm, ret_type, stack_slots); 2366 __ bind(Continue); 2367 } 2368 2369 // change thread state 2370 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2371 2372 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2373 // Check preemption for Object.wait() 2374 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2375 __ cmpptr(rscratch1, NULL_WORD); 2376 __ jccb(Assembler::equal, native_return); 2377 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2378 __ jmp(rscratch1); 2379 __ bind(native_return); 2380 2381 intptr_t the_pc = (intptr_t) __ pc(); 2382 oop_maps->add_gc_map(the_pc - start, map); 2383 } 2384 2385 2386 Label reguard; 2387 Label reguard_done; 2388 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2389 __ jcc(Assembler::equal, reguard); 2390 __ bind(reguard_done); 2391 2392 // native result if any is live 2393 2394 // Unlock 2395 Label slow_path_unlock; 2396 Label unlock_done; 2397 if (method->is_synchronized()) { 2398 2399 Label fast_done; 2400 2401 // Get locked oop from the handle we passed to jni 2402 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2403 2404 if (LockingMode == LM_LEGACY) { 2405 Label not_recur; 2406 // Simple recursive lock? 
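// Note on the check below (LM_LEGACY only): on a recursive acquire, the locking
// code above stored zero into the displaced-header word of the on-stack
// BasicLock ("for recursive case, the result is zero"), so a zero lock slot
// here identifies a recursive unlock and we only decrement the held monitor
// count instead of restoring a mark word.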
2407 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2408 __ jcc(Assembler::notEqual, not_recur); 2409 __ dec_held_monitor_count(); 2410 __ jmpb(fast_done); 2411 __ bind(not_recur); 2412 } 2413 2414 // Must save rax if it is live now because cmpxchg must use it 2415 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2416 save_native_result(masm, ret_type, stack_slots); 2417 } 2418 2419 if (LockingMode == LM_MONITOR) { 2420 __ jmp(slow_path_unlock); 2421 } else if (LockingMode == LM_LEGACY) { 2422 // get address of the stack lock 2423 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2424 // get old displaced header 2425 __ movptr(old_hdr, Address(rax, 0)); 2426 2427 // Atomic swap old header if oop still contains the stack lock 2428 __ lock(); 2429 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2430 __ jcc(Assembler::notEqual, slow_path_unlock); 2431 __ dec_held_monitor_count(); 2432 } else { 2433 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2434 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2435 } 2436 2437 // slow path re-enters here 2438 __ bind(unlock_done); 2439 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2440 restore_native_result(masm, ret_type, stack_slots); 2441 } 2442 2443 __ bind(fast_done); 2444 } 2445 if (DTraceMethodProbes) { 2446 save_native_result(masm, ret_type, stack_slots); 2447 __ mov_metadata(c_rarg1, method()); 2448 __ call_VM_leaf( 2449 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2450 r15_thread, c_rarg1); 2451 restore_native_result(masm, ret_type, stack_slots); 2452 } 2453 2454 __ reset_last_Java_frame(false); 2455 2456 // Unbox oop result, e.g. JNIHandles::resolve value. 2457 if (is_reference_type(ret_type)) { 2458 __ resolve_jobject(rax /* value */, 2459 r15_thread /* thread */, 2460 rcx /* tmp */); 2461 } 2462 2463 if (CheckJNICalls) { 2464 // clear_pending_jni_exception_check 2465 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2466 } 2467 2468 // reset handle block 2469 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2470 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2471 2472 // pop our frame 2473 2474 __ leave(); 2475 2476 // Any exception pending? 2477 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2478 __ jcc(Assembler::notEqual, exception_pending); 2479 2480 // Return 2481 2482 __ ret(0); 2483 2484 // Unexpected paths are out of line and go here 2485 2486 // forward the exception 2487 __ bind(exception_pending); 2488 2489 // and forward the exception 2490 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2491 2492 // Slow path locking & unlocking 2493 if (method->is_synchronized()) { 2494 2495 // BEGIN Slow path lock 2496 __ bind(slow_path_lock); 2497 2498 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2499 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2500 2501 // protect the args we've loaded 2502 save_args(masm, total_c_args, c_arg, out_regs); 2503 2504 __ mov(c_rarg0, obj_reg); 2505 __ mov(c_rarg1, lock_reg); 2506 __ mov(c_rarg2, r15_thread); 2507 2508 // Not a leaf but we have last_Java_frame setup as we want. 
2509 // We don't want to unmount in case of contention since that would complicate preserving 2510 // the arguments that had already been marshalled into the native convention. So we force 2511 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2512 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 2513 __ push_cont_fastpath(); 2514 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2515 __ pop_cont_fastpath(); 2516 restore_args(masm, total_c_args, c_arg, out_regs); 2517 2518 #ifdef ASSERT 2519 { Label L; 2520 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2521 __ jcc(Assembler::equal, L); 2522 __ stop("no pending exception allowed on exit from monitorenter"); 2523 __ bind(L); 2524 } 2525 #endif 2526 __ jmp(lock_done); 2527 2528 // END Slow path lock 2529 2530 // BEGIN Slow path unlock 2531 __ bind(slow_path_unlock); 2532 2533 // If we haven't already saved the native result we must save it now as xmm registers 2534 // are still exposed. 2535 __ vzeroupper(); 2536 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2537 save_native_result(masm, ret_type, stack_slots); 2538 } 2539 2540 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2541 2542 __ mov(c_rarg0, obj_reg); 2543 __ mov(c_rarg2, r15_thread); 2544 __ mov(r12, rsp); // remember sp 2545 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2546 __ andptr(rsp, -16); // align stack as required by ABI 2547 2548 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2549 // NOTE that obj_reg == rbx currently 2550 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2551 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2552 2553 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2554 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2555 __ mov(rsp, r12); // restore sp 2556 __ reinit_heapbase(); 2557 #ifdef ASSERT 2558 { 2559 Label L; 2560 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2561 __ jcc(Assembler::equal, L); 2562 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2563 __ bind(L); 2564 } 2565 #endif /* ASSERT */ 2566 2567 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2568 2569 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2570 restore_native_result(masm, ret_type, stack_slots); 2571 } 2572 __ jmp(unlock_done); 2573 2574 // END Slow path unlock 2575 2576 } // synchronized 2577 2578 // SLOW PATH Reguard the stack if needed 2579 2580 __ bind(reguard); 2581 __ vzeroupper(); 2582 save_native_result(masm, ret_type, stack_slots); 2583 __ mov(r12, rsp); // remember sp 2584 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2585 __ andptr(rsp, -16); // align stack as required by ABI 2586 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2587 __ mov(rsp, r12); // restore sp 2588 __ reinit_heapbase(); 2589 restore_native_result(masm, ret_type, stack_slots); 2590 // and continue 2591 __ jmp(reguard_done); 2592 2593 2594 2595 __ flush(); 2596 2597 nmethod *nm = nmethod::new_native_nmethod(method, 2598 compile_id, 2599 masm->code(), 2600 vep_offset, 2601 frame_complete, 2602 stack_slots / VMRegImpl::slots_per_word, 2603 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2604 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2605 oop_maps);
2606
2607 return nm;
2608 }
2609
2610 // this function returns the adjustment size (in number of words) to a c2i adapter
2611 // activation for use during deoptimization
2612 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2613 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2614 }
2615
2616
2617 uint SharedRuntime::out_preserve_stack_slots() {
2618 return 0;
2619 }
2620
2621
2622 // Number of stack slots between incoming argument block and the start of
2623 // a new frame. The PROLOG must add this many slots to the stack. The
2624 // EPILOG must remove this many slots. amd64 needs two slots for
2625 // return address.
2626 uint SharedRuntime::in_preserve_stack_slots() {
2627 return 4 + 2 * VerifyStackAtCalls;
2628 }
2629
2630 VMReg SharedRuntime::thread_register() {
2631 return r15_thread->as_VMReg();
2632 }
2633
2634 //------------------------------generate_deopt_blob----------------------------
2635 void SharedRuntime::generate_deopt_blob() {
2636 // Allocate space for the code
2637 ResourceMark rm;
2638 // Setup code generation tools
2639 int pad = 0;
2640 if (UseAVX > 2) {
2641 pad += 1024;
2642 }
2643 if (UseAPX) {
2644 pad += 1024;
2645 }
2646 #if INCLUDE_JVMCI
2647 if (EnableJVMCI) {
2648 pad += 512; // Increase the buffer size when compiling for JVMCI
2649 }
2650 #endif
2651 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2652 CodeBuffer buffer(name, 2560+pad, 1024);
2653 MacroAssembler* masm = new MacroAssembler(&buffer);
2654 int frame_size_in_words;
2655 OopMap* map = nullptr;
2656 OopMapSet *oop_maps = new OopMapSet();
2657
2658 // -------------
2659 // This code enters when returning to a de-optimized nmethod. A return
2660 // address has been pushed on the stack, and return values are in
2661 // registers.
2662 // If we are doing a normal deopt then we were called from the patched
2663 // nmethod from the point we returned to the nmethod. So the return
2664 // address on the stack is wrong by NativeCall::instruction_size.
2665 // We will adjust the value so it looks like we have the original return
2666 // address on the stack (like when we eagerly deoptimized).
2667 // In the case of an exception pending when deoptimizing, we enter
2668 // with a return address on the stack that points after the call we patched
2669 // into the exception handler. We have the following register state from,
2670 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2671 // rax: exception oop
2672 // rbx: exception handler
2673 // rdx: throwing pc
2674 // So in this case we simply jam rdx into the useless return address and
2675 // the stack looks just like we want.
2676 //
2677 // At this point we need to de-opt. We save the argument return
2678 // registers. We call the first C routine, fetch_unroll_info(). This
2679 // routine captures the return values and returns a structure which
2680 // describes the current frame size and the sizes of all replacement frames.
2681 // The current frame is compiled code and may contain many inlined
2682 // functions, each with their own JVM state. We pop the current frame, then
2683 // push all the new frames. Then we call the C routine unpack_frames() to
2684 // populate these frames. Finally unpack_frames() returns us the new target
2685 // address.
Notice that callee-save registers are BLOWN here; they have
2686 // already been captured in the vframeArray at the time the return PC was
2687 // patched.
2688 address start = __ pc();
2689 Label cont;
2690
2691 // Prolog for non-exception case!
2692
2693 // Save everything in sight.
2694 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2695
2696 // Normal deoptimization. Save exec mode for unpack_frames.
2697 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2698 __ jmp(cont);
2699
2700 int reexecute_offset = __ pc() - start;
2701 #if INCLUDE_JVMCI && !defined(COMPILER1)
2702 if (UseJVMCICompiler) {
2703 // JVMCI does not use this kind of deoptimization
2704 __ should_not_reach_here();
2705 }
2706 #endif
2707
2708 // Reexecute case
2709 // The return address is the pc that describes what bci to re-execute at.
2710
2711 // No need to update map as each call to save_live_registers will produce identical oopmap
2712 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2713
2714 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2715 __ jmp(cont);
2716
2717 #if INCLUDE_JVMCI
2718 Label after_fetch_unroll_info_call;
2719 int implicit_exception_uncommon_trap_offset = 0;
2720 int uncommon_trap_offset = 0;
2721
2722 if (EnableJVMCI) {
2723 implicit_exception_uncommon_trap_offset = __ pc() - start;
2724
2725 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2726 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2727
2728 uncommon_trap_offset = __ pc() - start;
2729
2730 // Save everything in sight.
2731 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2732 // fetch_unroll_info needs to call last_java_frame()
2733 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2734
2735 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2736 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2737
2738 __ movl(r14, Deoptimization::Unpack_reexecute);
2739 __ mov(c_rarg0, r15_thread);
2740 __ movl(c_rarg2, r14); // exec mode
2741 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2742 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2743
2744 __ reset_last_Java_frame(false);
2745
2746 __ jmp(after_fetch_unroll_info_call);
2747 } // EnableJVMCI
2748 #endif // INCLUDE_JVMCI
2749
2750 int exception_offset = __ pc() - start;
2751
2752 // Prolog for exception case
2753
2754 // all registers are dead at this entry point, except for rax, and
2755 // rdx which contain the exception oop and exception pc
2756 // respectively. Set them in TLS and fall thru to the
2757 // unpack_with_exception_in_tls entry point.
2758
2759 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2760 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2761
2762 int exception_in_tls_offset = __ pc() - start;
2763
2764 // new implementation because exception oop is now passed in JavaThread
2765
2766 // Prolog for exception case
2767 // All registers must be preserved because they might be used by LinearScan
2768 // Exception oop and throwing PC are passed in JavaThread
2769 // tos: stack at point of call to method that threw the exception (i.e.
only 2770 // args are on the stack, no return address) 2771 2772 // make room on stack for the return address 2773 // It will be patched later with the throwing pc. The correct value is not 2774 // available now because loading it from memory would destroy registers. 2775 __ push(0); 2776 2777 // Save everything in sight. 2778 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2779 2780 // Now it is safe to overwrite any register 2781 2782 // Deopt during an exception. Save exec mode for unpack_frames. 2783 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2784 2785 // load throwing pc from JavaThread and patch it as the return address 2786 // of the current frame. Then clear the field in JavaThread 2787 2788 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2789 __ movptr(Address(rbp, wordSize), rdx); 2790 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2791 2792 #ifdef ASSERT 2793 // verify that there is really an exception oop in JavaThread 2794 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2795 __ verify_oop(rax); 2796 2797 // verify that there is no pending exception 2798 Label no_pending_exception; 2799 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2800 __ testptr(rax, rax); 2801 __ jcc(Assembler::zero, no_pending_exception); 2802 __ stop("must not have pending exception here"); 2803 __ bind(no_pending_exception); 2804 #endif 2805 2806 __ bind(cont); 2807 2808 // Call C code. Need thread and this frame, but NOT official VM entry 2809 // crud. We cannot block on this call, no GC can happen. 2810 // 2811 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2812 2813 // fetch_unroll_info needs to call last_java_frame(). 2814 2815 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2816 #ifdef ASSERT 2817 { Label L; 2818 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2819 __ jcc(Assembler::equal, L); 2820 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2821 __ bind(L); 2822 } 2823 #endif // ASSERT 2824 __ mov(c_rarg0, r15_thread); 2825 __ movl(c_rarg1, r14); // exec_mode 2826 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2827 2828 // Need to have an oopmap that tells fetch_unroll_info where to 2829 // find any register it might need. 2830 oop_maps->add_gc_map(__ pc() - start, map); 2831 2832 __ reset_last_Java_frame(false); 2833 2834 #if INCLUDE_JVMCI 2835 if (EnableJVMCI) { 2836 __ bind(after_fetch_unroll_info_call); 2837 } 2838 #endif 2839 2840 // Load UnrollBlock* into rdi 2841 __ mov(rdi, rax); 2842 2843 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2844 Label noException; 2845 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2846 __ jcc(Assembler::notEqual, noException); 2847 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2848 // QQQ this is useless it was null above 2849 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2850 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2851 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2852 2853 __ verify_oop(rax); 2854 2855 // Overwrite the result registers with the exception results. 
2856 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2857 // I think this is useless
2858 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2859
2860 __ bind(noException);
2861
2862 // Only register save data is on the stack.
2863 // Now restore the result registers. Everything else is either dead
2864 // or captured in the vframeArray.
2865 RegisterSaver::restore_result_registers(masm);
2866
2867 // All of the register save area has been popped off the stack. Only the
2868 // return address remains.
2869
2870 // Pop all the frames we must move/replace.
2871 //
2872 // Frame picture (youngest to oldest)
2873 // 1: self-frame (no frame link)
2874 // 2: deopting frame (no frame link)
2875 // 3: caller of deopting frame (could be compiled/interpreted).
2876 //
2877 // Note: by leaving the return address of self-frame on the stack
2878 // and using the size of frame 2 to adjust the stack
2879 // when we are done the return to frame 3 will still be on the stack.
2880
2881 // Pop deoptimized frame
2882 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2883 __ addptr(rsp, rcx);
2884
2885 // rsp should be pointing at the return address to the caller (3)
2886
2887 // Pick up the initial fp we should save
2888 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2889 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2890
2891 #ifdef ASSERT
2892 // Compilers generate code that bangs the stack by as much as the
2893 // interpreter would need. So this stack banging should never
2894 // trigger a fault. Verify that it does not on non-product builds.
2895 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2896 __ bang_stack_size(rbx, rcx);
2897 #endif
2898
2899 // Load address of array of frame pcs into rcx
2900 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2901
2902 // Trash the old pc
2903 __ addptr(rsp, wordSize);
2904
2905 // Load address of array of frame sizes into rsi
2906 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2907
2908 // Load counter into rdx
2909 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2910
2911 // Now adjust the caller's stack to make up for the extra locals
2912 // but record the original sp so that we can save it in the skeletal interpreter
2913 // frame and the stack walking of interpreter_sender will get the unextended sp
2914 // value and not the "real" sp value.
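// A C-like sketch (comments only, not compiled) of the frame reconstruction
// loop that follows; register assignments as in the code below, array layout
// as provided by the UnrollBlock:
//
//   sender_sp = rsp;                              // r8: unextended sp for the first skeletal frame
//   rsp      -= caller_adjustment;                // room for the caller's extra locals
//   for (count = number_of_frames; count != 0; count--) {
//     size = *frame_sizes++;                      // rbx
//     push(*frame_pcs++);                         // return address for this frame
//     push(rbp); rbp = rsp;                       // enter()
//     rsp -= size - 2*wordSize;                   // pc and rbp were already pushed by hand
//     frame[interpreter_frame_last_sp_offset]   = 0;         // corrected by layout_activation_impl
//     frame[interpreter_frame_sender_sp_offset] = sender_sp; // make the frame walkable
//     sender_sp = rsp;                            // becomes the next frame's unextended sp
//   }
//   push(*frame_pcs);                             // final return address (for the self-frame)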
2915 2916 const Register sender_sp = r8; 2917 2918 __ mov(sender_sp, rsp); 2919 __ movl(rbx, Address(rdi, 2920 Deoptimization::UnrollBlock:: 2921 caller_adjustment_offset())); 2922 __ subptr(rsp, rbx); 2923 2924 // Push interpreter frames in a loop 2925 Label loop; 2926 __ bind(loop); 2927 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2928 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2929 __ pushptr(Address(rcx, 0)); // Save return address 2930 __ enter(); // Save old & set new ebp 2931 __ subptr(rsp, rbx); // Prolog 2932 // This value is corrected by layout_activation_impl 2933 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2934 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2935 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2936 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2937 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2938 __ decrementl(rdx); // Decrement counter 2939 __ jcc(Assembler::notZero, loop); 2940 __ pushptr(Address(rcx, 0)); // Save final return address 2941 2942 // Re-push self-frame 2943 __ enter(); // Save old & set new ebp 2944 2945 // Allocate a full sized register save area. 2946 // Return address and rbp are in place, so we allocate two less words. 2947 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2948 2949 // Restore frame locals after moving the frame 2950 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2951 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2952 2953 // Call C code. Need thread but NOT official VM entry 2954 // crud. We cannot block on this call, no GC can happen. Call should 2955 // restore return values to their stack-slots with the new SP. 2956 // 2957 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2958 2959 // Use rbp because the frames look interpreted now 2960 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2961 // Don't need the precise return PC here, just precise enough to point into this code blob. 2962 address the_pc = __ pc(); 2963 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2964 2965 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2966 __ mov(c_rarg0, r15_thread); 2967 __ movl(c_rarg1, r14); // second arg: exec_mode 2968 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2969 // Revert SP alignment after call since we're going to do some SP relative addressing below 2970 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2971 2972 // Set an oopmap for the call site 2973 // Use the same PC we used for the last java frame 2974 oop_maps->add_gc_map(the_pc - start, 2975 new OopMap( frame_size_in_words, 0 )); 2976 2977 // Clear fp AND pc 2978 __ reset_last_Java_frame(true); 2979 2980 // Collect return values 2981 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2982 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2983 // I think this is useless (throwing pc?) 2984 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2985 2986 // Pop self-frame. 
2987 __ leave(); // Epilog
2988
2989 // Jump to interpreter
2990 __ ret(0);
2991
2992 // Make sure all code is generated
2993 masm->flush();
2994
2995 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2996 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2997 #if INCLUDE_JVMCI
2998 if (EnableJVMCI) {
2999 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3000 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3001 }
3002 #endif
3003 }
3004
3005 //------------------------------generate_handler_blob------
3006 //
3007 // Generate a special Compile2Runtime blob that saves all registers
3008 // and sets up an oopmap.
3009 //
3010 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3011 assert(StubRoutines::forward_exception_entry() != nullptr,
3012 "must be generated before");
3013 assert(is_polling_page_id(id), "expected a polling page stub id");
3014
3015 ResourceMark rm;
3016 OopMapSet *oop_maps = new OopMapSet();
3017 OopMap* map;
3018
3019 // Allocate space for the code. Set up code generation tools.
3020 const char* name = SharedRuntime::stub_name(id);
3021 CodeBuffer buffer(name, 2548, 1024);
3022 MacroAssembler* masm = new MacroAssembler(&buffer);
3023
3024 address start = __ pc();
3025 address call_pc = nullptr;
3026 int frame_size_in_words;
3027 bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3028 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3029
3030 // Make room for return address (or push it again)
3031 if (!cause_return) {
3032 __ push(rbx);
3033 }
3034
3035 // Save registers, fpu state, and flags
3036 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3037
3038 // The following is basically a call_VM. However, we need the precise
3039 // address of the call in order to generate an oopmap. Hence, we do all the
3040 // work ourselves.
3041
3042 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3043
3044 // The return address must always be correct so that the frame constructor
3045 // never sees an invalid pc.
3046
3047 if (!cause_return) {
3048 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3049 // Additionally, rbx is a callee-saved register and we can look at it later to determine
3050 // if someone changed the return address for us!
3051 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3052 __ movptr(Address(rbp, wordSize), rbx);
3053 }
3054
3055 // Do the call
3056 __ mov(c_rarg0, r15_thread);
3057 __ call(RuntimeAddress(call_ptr));
3058
3059 // Set an oopmap for the call site. This oopmap will map all
3060 // oop-registers and debug-info registers as callee-saved. This
3061 // will allow deoptimization at this safepoint to find all possible
3062 // debug-info recordings, as well as let GC find all oops.
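// The offset recorded below is the pc immediately after the call above, i.e.
// the return address a stack walker will see for this frame.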
3063
3064 oop_maps->add_gc_map( __ pc() - start, map);
3065
3066 Label noException;
3067
3068 __ reset_last_Java_frame(false);
3069
3070 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3071 __ jcc(Assembler::equal, noException);
3072
3073 // Exception pending
3074
3075 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3076
3077 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3078
3079 // No exception case
3080 __ bind(noException);
3081
3082 Label no_adjust;
3083 #ifdef ASSERT
3084 Label bail;
3085 #endif
3086 if (!cause_return) {
3087 Label no_prefix, not_special, check_rex_prefix;
3088
3089 // If our stashed return pc was modified by the runtime, we avoid touching it
3090 __ cmpptr(rbx, Address(rbp, wordSize));
3091 __ jcc(Assembler::notEqual, no_adjust);
3092
3093 // Skip over the poll instruction.
3094 // See NativeInstruction::is_safepoint_poll()
3095 // Possible encodings:
3096 // 85 00 test %eax,(%rax)
3097 // 85 01 test %eax,(%rcx)
3098 // 85 02 test %eax,(%rdx)
3099 // 85 03 test %eax,(%rbx)
3100 // 85 06 test %eax,(%rsi)
3101 // 85 07 test %eax,(%rdi)
3102 //
3103 // 41 85 00 test %eax,(%r8)
3104 // 41 85 01 test %eax,(%r9)
3105 // 41 85 02 test %eax,(%r10)
3106 // 41 85 03 test %eax,(%r11)
3107 // 41 85 06 test %eax,(%r14)
3108 // 41 85 07 test %eax,(%r15)
3109 //
3110 // 85 04 24 test %eax,(%rsp)
3111 // 41 85 04 24 test %eax,(%r12)
3112 // 85 45 00 test %eax,0x0(%rbp)
3113 // 41 85 45 00 test %eax,0x0(%r13)
3114 //
3115 // Notes:
3116 // Format of legacy MAP0 test instruction:
3117 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3118 // o For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the first
3119 // register operand and of the base register of the memory operand are in the range [0, 8),
3120 // so no additional REX prefix (whose REX.B bit would hold the MSB of the register encoding)
3121 // is required; a two-byte encoding is sufficient.
3122 // o For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the BASE
3123 // register of the memory operand is 1000b, so an additional REX prefix is needed,
3124 // thereby adding one byte to the instruction encoding.
3125 // o If the BASE register is one of the 32 extended GPRs available only on targets supporting
3126 // the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the most
3127 // significant two bits of the 5-bit register encoding.
3128
3129 if (VM_Version::supports_apx_f()) {
3130 __ cmpb(Address(rbx, 0), Assembler::REX2);
3131 __ jccb(Assembler::notEqual, check_rex_prefix);
3132 __ addptr(rbx, 2);
3133 __ bind(check_rex_prefix);
3134 }
3135 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3136 __ jccb(Assembler::notEqual, no_prefix);
3137 __ addptr(rbx, 1);
3138 __ bind(no_prefix);
3139 #ifdef ASSERT
3140 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3141 #endif
3142 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3143 // r12/rsp 0x04
3144 // r13/rbp 0x05
3145 __ movzbq(rcx, Address(rbx, 1));
3146 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3147 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3148 __ cmpptr(rcx, 1);
3149 __ jccb(Assembler::above, not_special);
3150 __ addptr(rbx, 1);
3151 __ bind(not_special);
3152 #ifdef ASSERT
3153 // Verify the correct encoding of the poll we're about to skip.
3154 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3155 __ jcc(Assembler::notEqual, bail);
3156 // Mask out the modrm bits
3157 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3158 // rax encodes to 0, so if the bits are nonzero it's incorrect
3159 __ jcc(Assembler::notZero, bail);
3160 #endif
3161 // Adjust return pc forward to step over the safepoint poll instruction
3162 __ addptr(rbx, 2);
3163 __ movptr(Address(rbp, wordSize), rbx);
3164 }
3165
3166 __ bind(no_adjust);
3167 // Normal exit, restore registers and exit.
3168 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3169 __ ret(0);
3170
3171 #ifdef ASSERT
3172 __ bind(bail);
3173 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3174 #endif
3175
3176 // Make sure all code is generated
3177 masm->flush();
3178
3179 // Fill out other meta info
3180 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3181 }
3182
3183 //
3184 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3185 //
3186 // Generate a stub that calls into the VM to find the proper destination
3187 // of a Java call. All the argument registers are live at this point,
3188 // but since this is generic code we don't know what they are, so the
3189 // caller must do any GC of the args.
3190 //
3191 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3192 assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3193 assert(is_resolve_id(id), "expected a resolve stub id");
3194
3195 // allocate space for the code
3196 ResourceMark rm;
3197
3198 const char* name = SharedRuntime::stub_name(id);
3199 CodeBuffer buffer(name, 1552, 512);
3200 MacroAssembler* masm = new MacroAssembler(&buffer);
3201
3202 int frame_size_in_words;
3203
3204 OopMapSet *oop_maps = new OopMapSet();
3205 OopMap* map = nullptr;
3206
3207 int start = __ offset();
3208
3209 // No need to save vector registers since they are caller-saved anyway.
3210 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3211
3212 int frame_complete = __ offset();
3213
3214 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3215
3216 __ mov(c_rarg0, r15_thread);
3217
3218 __ call(RuntimeAddress(destination));
3219
3220
3221 // Set an oopmap for the call site.
3222 // We need this not only for callee-saved registers, but also for volatile
3223 // registers that the compiler might be keeping live across a safepoint.
3224
3225 oop_maps->add_gc_map( __ offset() - start, map);
3226
3227 // rax contains the address we are going to jump to, assuming no exception got installed
3228
3229 // clear last_Java_sp
3230 __ reset_last_Java_frame(false);
3231 // check for pending exceptions
3232 Label pending;
3233 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3234 __ jcc(Assembler::notEqual, pending);
3235
3236 // get the returned Method*
3237 __ get_vm_result_2(rbx, r15_thread);
3238 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3239
3240 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3241
3242 RegisterSaver::restore_live_registers(masm);
3243
3244 // We are back to the original state on entry and ready to go.
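// (rax was written into the register save area above before the restore, so it
// still holds the destination returned by the resolver and we can tail-jump to it.)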
3245
3246 __ jmp(rax);
3247
3248 // Pending exception after the safepoint
3249
3250 __ bind(pending);
3251
3252 RegisterSaver::restore_live_registers(masm);
3253
3254 // exception pending => remove activation and forward to exception handler
3255
3256 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3257
3258 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3259 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3260
3261 // -------------
3262 // make sure all code is generated
3263 masm->flush();
3264
3265 // return the blob
3266 // note: the frame size passed here is in words (codeBlob frame sizes are in words)
3267 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3268 }
3269
3270 // Continuation point for throwing implicit exceptions that are
3271 // not handled in the current activation. Fabricates an exception
3272 // oop and initiates normal exception dispatching in this
3273 // frame. Since we need to preserve callee-saved values (currently
3274 // only for C2, but done for C1 as well) we need a callee-saved oop
3275 // map and therefore have to make these stubs into RuntimeStubs
3276 // rather than BufferBlobs. If the compiler needs all registers to
3277 // be preserved between the fault point and the exception handler
3278 // then it must assume responsibility for that in
3279 // AbstractCompiler::continuation_for_implicit_null_exception or
3280 // continuation_for_implicit_division_by_zero_exception. All other
3281 // implicit exceptions (e.g., NullPointerException or
3282 // AbstractMethodError on entry) are either at call sites or
3283 // otherwise assume that stack unwinding will be initiated, so
3284 // caller-saved registers are assumed volatile in the compiler.
3285 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3286 assert(is_throw_id(id), "expected a throw stub id");
3287
3288 const char* name = SharedRuntime::stub_name(id);
3289
3290 // Information about frame layout at time of blocking runtime call.
3291 // Note that we only have to preserve callee-saved registers since
3292 // the compilers are responsible for supplying a continuation point
3293 // if they expect all registers to be preserved.
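// Worked example of the slot arithmetic in the layout below (offsets are in jint
// slots; frame::arg_reg_save_area_bytes is platform dependent, and the values of
// 0 for the System V ABI and 32 bytes of shadow space for Windows x64 used here
// are assumptions of this example only):
//   0 bytes reserved:  rbp_off = 0, return_off = 2,  framesize = 4 slots
//                      = 16 bytes = 2 words, and framesize/2 == 2 is even;
//   32 bytes reserved: rbp_off = 8, return_off = 10, framesize = 12 slots
//                      = 48 bytes = 6 words.
// The "subptr(rsp, (framesize-4) << LogBytesPerInt)" prolog further down then
// reserves exactly the argument register save area (0 or 32 bytes in these cases).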
3294 enum layout { 3295 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3296 rbp_off2, 3297 return_off, 3298 return_off2, 3299 framesize // inclusive of return address 3300 }; 3301 3302 int insts_size = 512; 3303 int locs_size = 64; 3304 3305 ResourceMark rm; 3306 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3307 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3308 3309 CodeBuffer code(name, insts_size, locs_size); 3310 OopMapSet* oop_maps = new OopMapSet(); 3311 MacroAssembler* masm = new MacroAssembler(&code); 3312 3313 address start = __ pc(); 3314 3315 // This is an inlined and slightly modified version of call_VM 3316 // which has the ability to fetch the return PC out of 3317 // thread-local storage and also sets up last_Java_sp slightly 3318 // differently than the real call_VM 3319 3320 __ enter(); // required for proper stackwalking of RuntimeStub frame 3321 3322 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3323 3324 // return address and rbp are already in place 3325 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3326 3327 int frame_complete = __ pc() - start; 3328 3329 // Set up last_Java_sp and last_Java_fp 3330 address the_pc = __ pc(); 3331 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3332 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3333 3334 // Call runtime 3335 __ movptr(c_rarg0, r15_thread); 3336 BLOCK_COMMENT("call runtime_entry"); 3337 __ call(RuntimeAddress(runtime_entry)); 3338 3339 // Generate oop map 3340 OopMap* map = new OopMap(framesize, 0); 3341 3342 oop_maps->add_gc_map(the_pc - start, map); 3343 3344 __ reset_last_Java_frame(true); 3345 3346 __ leave(); // required for proper stackwalking of RuntimeStub frame 3347 3348 // check for pending exceptions 3349 #ifdef ASSERT 3350 Label L; 3351 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3352 __ jcc(Assembler::notEqual, L); 3353 __ should_not_reach_here(); 3354 __ bind(L); 3355 #endif // ASSERT 3356 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3357 3358 3359 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3360 RuntimeStub* stub = 3361 RuntimeStub::new_runtime_stub(name, 3362 &code, 3363 frame_complete, 3364 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3365 oop_maps, false); 3366 return stub; 3367 } 3368 3369 //------------------------------Montgomery multiplication------------------------ 3370 // 3371 3372 #ifndef _WINDOWS 3373 3374 // Subtract 0:b from carry:a. Return carry. 3375 static julong 3376 sub(julong a[], julong b[], julong carry, long len) { 3377 long long i = 0, cnt = len; 3378 julong tmp; 3379 asm volatile("clc; " 3380 "0: ; " 3381 "mov (%[b], %[i], 8), %[tmp]; " 3382 "sbb %[tmp], (%[a], %[i], 8); " 3383 "inc %[i]; dec %[cnt]; " 3384 "jne 0b; " 3385 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3386 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3387 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3388 : "memory"); 3389 return tmp; 3390 } 3391 3392 // Multiply (unsigned) Long A by Long B, accumulating the double- 3393 // length result into the accumulator formed of T0, T1, and T2. 3394 #define MACC(A, B, T0, T1, T2) \ 3395 do { \ 3396 unsigned long hi, lo; \ 3397 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3398 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3399 : "r"(A), "a"(B) : "cc"); \ 3400 } while(0) 3401 3402 // As above, but add twice the double-length result into the 3403 // accumulator. 
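// For reference, the accumulation MACC performs (and which MACC2 below performs
// twice for the same product) is equivalent to the following portable sketch.
// It is illustrative only and not compiled here; it assumes a compiler that
// provides unsigned __int128, as GCC and Clang do on x86_64:
//
//   static inline void macc_ref(julong a, julong b,
//                               julong* t0, julong* t1, julong* t2) {
//     unsigned __int128 p = (unsigned __int128)a * b;            // 128-bit product
//     unsigned __int128 s = (unsigned __int128)*t0 + (julong)p;  // t0 += lo(p)
//     *t0 = (julong)s;
//     s = (s >> 64) + *t1 + (julong)(p >> 64);                   // t1 += hi(p) + carry
//     *t1 = (julong)s;
//     *t2 += (julong)(s >> 64);                                  // propagate final carry
//   }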
3404 #define MACC2(A, B, T0, T1, T2) \ 3405 do { \ 3406 unsigned long hi, lo; \ 3407 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3408 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3409 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3410 : "r"(A), "a"(B) : "cc"); \ 3411 } while(0) 3412 3413 #else //_WINDOWS 3414 3415 static julong 3416 sub(julong a[], julong b[], julong carry, long len) { 3417 long i; 3418 julong tmp; 3419 unsigned char c = 1; 3420 for (i = 0; i < len; i++) { 3421 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3422 a[i] = tmp; 3423 } 3424 c = _addcarry_u64(c, carry, ~0, &tmp); 3425 return tmp; 3426 } 3427 3428 // Multiply (unsigned) Long A by Long B, accumulating the double- 3429 // length result into the accumulator formed of T0, T1, and T2. 3430 #define MACC(A, B, T0, T1, T2) \ 3431 do { \ 3432 julong hi, lo; \ 3433 lo = _umul128(A, B, &hi); \ 3434 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3435 c = _addcarry_u64(c, hi, T1, &T1); \ 3436 _addcarry_u64(c, T2, 0, &T2); \ 3437 } while(0) 3438 3439 // As above, but add twice the double-length result into the 3440 // accumulator. 3441 #define MACC2(A, B, T0, T1, T2) \ 3442 do { \ 3443 julong hi, lo; \ 3444 lo = _umul128(A, B, &hi); \ 3445 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3446 c = _addcarry_u64(c, hi, T1, &T1); \ 3447 _addcarry_u64(c, T2, 0, &T2); \ 3448 c = _addcarry_u64(0, lo, T0, &T0); \ 3449 c = _addcarry_u64(c, hi, T1, &T1); \ 3450 _addcarry_u64(c, T2, 0, &T2); \ 3451 } while(0) 3452 3453 #endif //_WINDOWS 3454 3455 // Fast Montgomery multiplication. The derivation of the algorithm is 3456 // in A Cryptographic Library for the Motorola DSP56000, 3457 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3458 3459 static void NOINLINE 3460 montgomery_multiply(julong a[], julong b[], julong n[], 3461 julong m[], julong inv, int len) { 3462 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3463 int i; 3464 3465 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3466 3467 for (i = 0; i < len; i++) { 3468 int j; 3469 for (j = 0; j < i; j++) { 3470 MACC(a[j], b[i-j], t0, t1, t2); 3471 MACC(m[j], n[i-j], t0, t1, t2); 3472 } 3473 MACC(a[i], b[0], t0, t1, t2); 3474 m[i] = t0 * inv; 3475 MACC(m[i], n[0], t0, t1, t2); 3476 3477 assert(t0 == 0, "broken Montgomery multiply"); 3478 3479 t0 = t1; t1 = t2; t2 = 0; 3480 } 3481 3482 for (i = len; i < 2*len; i++) { 3483 int j; 3484 for (j = i-len+1; j < len; j++) { 3485 MACC(a[j], b[i-j], t0, t1, t2); 3486 MACC(m[j], n[i-j], t0, t1, t2); 3487 } 3488 m[i-len] = t0; 3489 t0 = t1; t1 = t2; t2 = 0; 3490 } 3491 3492 while (t0) 3493 t0 = sub(m, n, t0, len); 3494 } 3495 3496 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3497 // multiplies so it should be up to 25% faster than Montgomery 3498 // multiplication. However, its loop control is more complex and it 3499 // may actually run slower on some machines. 
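// The saving comes from the symmetry of a square: in the convolution for column
// i the cross products a[j]*a[i-j] and a[i-j]*a[j] are equal, so the loops below
// accumulate each distinct pair once via MACC2 (which adds the product twice)
// and add the diagonal term a[i/2]*a[i/2] with a single MACC when i is even.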
3500
3501 static void NOINLINE
3502 montgomery_square(julong a[], julong n[],
3503 julong m[], julong inv, int len) {
3504 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3505 int i;
3506
3507 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3508
3509 for (i = 0; i < len; i++) {
3510 int j;
3511 int end = (i+1)/2;
3512 for (j = 0; j < end; j++) {
3513 MACC2(a[j], a[i-j], t0, t1, t2);
3514 MACC(m[j], n[i-j], t0, t1, t2);
3515 }
3516 if ((i & 1) == 0) {
3517 MACC(a[j], a[j], t0, t1, t2);
3518 }
3519 for (; j < i; j++) {
3520 MACC(m[j], n[i-j], t0, t1, t2);
3521 }
3522 m[i] = t0 * inv;
3523 MACC(m[i], n[0], t0, t1, t2);
3524
3525 assert(t0 == 0, "broken Montgomery square");
3526
3527 t0 = t1; t1 = t2; t2 = 0;
3528 }
3529
3530 for (i = len; i < 2*len; i++) {
3531 int start = i-len+1;
3532 int end = start + (len - start)/2;
3533 int j;
3534 for (j = start; j < end; j++) {
3535 MACC2(a[j], a[i-j], t0, t1, t2);
3536 MACC(m[j], n[i-j], t0, t1, t2);
3537 }
3538 if ((i & 1) == 0) {
3539 MACC(a[j], a[j], t0, t1, t2);
3540 }
3541 for (; j < len; j++) {
3542 MACC(m[j], n[i-j], t0, t1, t2);
3543 }
3544 m[i-len] = t0;
3545 t0 = t1; t1 = t2; t2 = 0;
3546 }
3547
3548 while (t0)
3549 t0 = sub(m, n, t0, len);
3550 }
3551
3552 // Swap words in a longword.
3553 static julong swap(julong x) {
3554 return (x << 32) | (x >> 32);
3555 }
3556
3557 // Copy len longwords from s to d, word-swapping as we go. The
3558 // destination array is reversed.
3559 static void reverse_words(julong *s, julong *d, int len) {
3560 d += len;
3561 while(len-- > 0) {
3562 d--;
3563 *d = swap(*s);
3564 s++;
3565 }
3566 }
3567
3568 // The threshold at which squaring is advantageous was determined
3569 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3570 #define MONTGOMERY_SQUARING_THRESHOLD 64
3571
3572 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3573 jint len, jlong inv,
3574 jint *m_ints) {
3575 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3576 int longwords = len/2;
3577
3578 // Make very sure we don't use so much space that the stack might
3579 // overflow. 512 jints corresponds to a 16384-bit integer and
3580 // will use a total of 8k bytes of stack space here.
3581 int divisor = sizeof(julong) * 4;
3582 guarantee(longwords <= 8192 / divisor, "must be");
3583 int total_allocation = longwords * sizeof (julong) * 4;
3584 julong *scratch = (julong *)alloca(total_allocation);
3585
3586 // Local scratch arrays
3587 julong
3588 *a = scratch + 0 * longwords,
3589 *b = scratch + 1 * longwords,
3590 *n = scratch + 2 * longwords,
3591 *m = scratch + 3 * longwords;
3592
3593 reverse_words((julong *)a_ints, a, longwords);
3594 reverse_words((julong *)b_ints, b, longwords);
3595 reverse_words((julong *)n_ints, n, longwords);
3596
3597 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3598
3599 reverse_words(m, (julong *)m_ints, longwords);
3600 }
3601
3602 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3603 jint len, jlong inv,
3604 jint *m_ints) {
3605 assert(len % 2 == 0, "array length in montgomery_square must be even");
3606 int longwords = len/2;
3607
3608 // Make very sure we don't use so much space that the stack might
3609 // overflow. 512 jints corresponds to a 16384-bit integer and
3610 // will use a total of 6k bytes of stack space here.
3611 int divisor = sizeof(julong) * 3; 3612 guarantee(longwords <= (8192 / divisor), "must be"); 3613 int total_allocation = longwords * sizeof (julong) * 3; 3614 julong *scratch = (julong *)alloca(total_allocation); 3615 3616 // Local scratch arrays 3617 julong 3618 *a = scratch + 0 * longwords, 3619 *n = scratch + 1 * longwords, 3620 *m = scratch + 2 * longwords; 3621 3622 reverse_words((julong *)a_ints, a, longwords); 3623 reverse_words((julong *)n_ints, n, longwords); 3624 3625 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3626 ::montgomery_square(a, n, m, (julong)inv, longwords); 3627 } else { 3628 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3629 } 3630 3631 reverse_words(m, (julong *)m_ints, longwords); 3632 } 3633 3634 #if INCLUDE_JFR 3635 3636 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 3637 // It returns a jobject handle to the event writer. 3638 // The handle is dereferenced and the return value is the event writer oop. 3639 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 3640 enum layout { 3641 rbp_off, 3642 rbpH_off, 3643 return_off, 3644 return_off2, 3645 framesize // inclusive of return address 3646 }; 3647 3648 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id); 3649 CodeBuffer code(name, 1024, 64); 3650 MacroAssembler* masm = new MacroAssembler(&code); 3651 address start = __ pc(); 3652 3653 __ enter(); 3654 address the_pc = __ pc(); 3655 3656 int frame_complete = the_pc - start; 3657 3658 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3659 __ movptr(c_rarg0, r15_thread); 3660 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 3661 __ reset_last_Java_frame(true); 3662 3663 // rax is jobject handle result, unpack and process it through a barrier. 3664 __ resolve_global_jobject(rax, r15_thread, c_rarg0); 3665 3666 __ leave(); 3667 __ ret(0); 3668 3669 OopMapSet* oop_maps = new OopMapSet(); 3670 OopMap* map = new OopMap(framesize, 1); 3671 oop_maps->add_gc_map(frame_complete, map); 3672 3673 RuntimeStub* stub = 3674 RuntimeStub::new_runtime_stub(name, 3675 &code, 3676 frame_complete, 3677 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3678 oop_maps, 3679 false); 3680 return stub; 3681 } 3682 3683 // For c2: call to return a leased buffer. 3684 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 3685 enum layout { 3686 rbp_off, 3687 rbpH_off, 3688 return_off, 3689 return_off2, 3690 framesize // inclusive of return address 3691 }; 3692 3693 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id); 3694 CodeBuffer code(name, 1024, 64); 3695 MacroAssembler* masm = new MacroAssembler(&code); 3696 address start = __ pc(); 3697 3698 __ enter(); 3699 address the_pc = __ pc(); 3700 3701 int frame_complete = the_pc - start; 3702 3703 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 3704 __ movptr(c_rarg0, r15_thread); 3705 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 3706 __ reset_last_Java_frame(true); 3707 3708 __ leave(); 3709 __ ret(0); 3710 3711 OopMapSet* oop_maps = new OopMapSet(); 3712 OopMap* map = new OopMap(framesize, 1); 3713 oop_maps->add_gc_map(frame_complete, map); 3714 3715 RuntimeStub* stub = 3716 RuntimeStub::new_runtime_stub(name, 3717 &code, 3718 frame_complete, 3719 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3720 oop_maps, 3721 false); 3722 return stub; 3723 } 3724 3725 #endif // INCLUDE_JFR 3726