1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef _WINDOWS 26 #include "alloca.h" 27 #endif 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "code/compiledIC.hpp" 31 #include "code/debugInfoRec.hpp" 32 #include "code/nativeInst.hpp" 33 #include "code/vtableStubs.hpp" 34 #include "compiler/oopMap.hpp" 35 #include "gc/shared/collectedHeap.hpp" 36 #include "gc/shared/gcLocker.hpp" 37 #include "gc/shared/barrierSet.hpp" 38 #include "gc/shared/barrierSetAssembler.hpp" 39 #include "interpreter/interpreter.hpp" 40 #include "logging/log.hpp" 41 #include "memory/resourceArea.hpp" 42 #include "memory/universe.hpp" 43 #include "oops/klass.inline.hpp" 44 #include "oops/method.inline.hpp" 45 #include "prims/methodHandles.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/globals.hpp" 49 #include "runtime/jniHandles.hpp" 50 #include "runtime/safepointMechanism.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/signature.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "runtime/timerTrace.hpp" 55 #include "runtime/vframeArray.hpp" 56 #include "runtime/vm_version.hpp" 57 #include "utilities/align.hpp" 58 #include "utilities/checkedCast.hpp" 59 #include "utilities/formatBuffer.hpp" 60 #include "vmreg_x86.inline.hpp" 61 #ifdef COMPILER1 62 #include "c1/c1_Runtime1.hpp" 63 #endif 64 #ifdef COMPILER2 65 #include "opto/runtime.hpp" 66 #endif 67 #if INCLUDE_JVMCI 68 #include "jvmci/jvmciJavaClasses.hpp" 69 #endif 70 71 #define __ masm-> 72 73 #ifdef PRODUCT 74 #define BLOCK_COMMENT(str) /* nothing */ 75 #else 76 #define BLOCK_COMMENT(str) __ block_comment(str) 77 #endif // PRODUCT 78 79 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 80 81 class RegisterSaver { 82 // Capture info about frame layout. Layout offsets are in jint 83 // units because compiler frame slots are jints. 
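  // For illustration (an example, not an additional definition): with BytesPerInt == 4,
  // a 16-byte XMM register spans 16/BytesPerInt == 4 jint slots and a 64-bit GPR spans
  // two slots (hence the *_off / *H_off pairs below). Converting an enum value back to
  // a byte offset is a single multiply, which is what the *_offset_in_bytes() accessors
  // below compute, e.g. rax_offset_in_bytes() == BytesPerInt * rax_off.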
84 #define XSAVE_AREA_BEGIN 160 85 #define XSAVE_AREA_YMM_BEGIN 576 86 #define XSAVE_AREA_EGPRS 960 87 #define XSAVE_AREA_OPMASK_BEGIN 1088 88 #define XSAVE_AREA_ZMM_BEGIN 1152 89 #define XSAVE_AREA_UPPERBANK 1664 90 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 91 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 92 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 93 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 94 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 95 enum layout { 96 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 97 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 98 DEF_XMM_OFFS(0), 99 DEF_XMM_OFFS(1), 100 // 2..15 are implied in range usage 101 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 102 DEF_YMM_OFFS(0), 103 DEF_YMM_OFFS(1), 104 // 2..15 are implied in range usage 105 r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 106 r31H_off, 107 r30_off, r30H_off, 108 r29_off, r29H_off, 109 r28_off, r28H_off, 110 r27_off, r27H_off, 111 r26_off, r26H_off, 112 r25_off, r25H_off, 113 r24_off, r24H_off, 114 r23_off, r23H_off, 115 r22_off, r22H_off, 116 r21_off, r21H_off, 117 r20_off, r20H_off, 118 r19_off, r19H_off, 119 r18_off, r18H_off, 120 r17_off, r17H_off, 121 r16_off, r16H_off, 122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 123 DEF_OPMASK_OFFS(0), 124 DEF_OPMASK_OFFS(1), 125 // 2..7 are implied in range usage 126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 127 DEF_ZMM_OFFS(0), 128 DEF_ZMM_OFFS(1), 129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 130 DEF_ZMM_UPPER_OFFS(16), 131 DEF_ZMM_UPPER_OFFS(17), 132 // 18..31 are implied in range usage 133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 134 fpu_stateH_end, 135 r15_off, r15H_off, 136 r14_off, r14H_off, 137 r13_off, r13H_off, 138 r12_off, r12H_off, 139 r11_off, r11H_off, 140 r10_off, r10H_off, 141 r9_off, r9H_off, 142 r8_off, r8H_off, 143 rdi_off, rdiH_off, 144 rsi_off, rsiH_off, 145 ignore_off, ignoreH_off, // extra copy of rbp 146 rsp_off, rspH_off, 147 rbx_off, rbxH_off, 148 rdx_off, rdxH_off, 149 rcx_off, rcxH_off, 150 rax_off, raxH_off, 151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 152 align_off, alignH_off, 153 flags_off, flagsH_off, 154 // The frame sender code expects that rbp will be in the "natural" place and 155 // will override any oopMap setting for it. We must therefore force the layout 156 // so that it agrees with the frame sender code. 
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
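  // Rough sketch of the frame built below (higher addresses on top; exact slot
  // offsets are given by the layout enum above, this is for orientation only):
  //   [ return address ]               <- already pushed by the caller
  //   [ saved rbp      ]               <- enter()
  //   [ flags + 8-byte alignment pad ] <- pushf(); subq(rsp, 8)
  //   [ legacy GPRs    ]               <- save_legacy_gprs()
  //   [ FXSAVE/XSAVE image ]           <- push_FPU_state(), FPUStateSizeInWords words
  //   [ arg register save area ]       <- frame::arg_reg_save_area_bytes, may be empty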
207 208 __ enter(); // rsp becomes 16-byte aligned here 209 __ pushf(); 210 // Make sure rsp stays 16-byte aligned 211 __ subq(rsp, 8); 212 // Push CPU state in multiple of 16 bytes 213 __ save_legacy_gprs(); 214 __ push_FPU_state(); 215 216 217 // push cpu state handles this on EVEX enabled targets 218 if (save_wide_vectors) { 219 // Save upper half of YMM registers(0..15) 220 int base_addr = XSAVE_AREA_YMM_BEGIN; 221 for (int n = 0; n < 16; n++) { 222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 223 } 224 if (VM_Version::supports_evex()) { 225 // Save upper half of ZMM registers(0..15) 226 base_addr = XSAVE_AREA_ZMM_BEGIN; 227 for (int n = 0; n < 16; n++) { 228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 229 } 230 // Save full ZMM registers(16..num_xmm_regs) 231 base_addr = XSAVE_AREA_UPPERBANK; 232 off = 0; 233 int vector_len = Assembler::AVX_512bit; 234 for (int n = 16; n < num_xmm_regs; n++) { 235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 236 } 237 #if COMPILER2_OR_JVMCI 238 base_addr = XSAVE_AREA_OPMASK_BEGIN; 239 off = 0; 240 for(int n = 0; n < KRegister::number_of_registers; n++) { 241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 242 } 243 #endif 244 } 245 } else { 246 if (VM_Version::supports_evex()) { 247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 248 int base_addr = XSAVE_AREA_UPPERBANK; 249 off = 0; 250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 251 for (int n = 16; n < num_xmm_regs; n++) { 252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 253 } 254 #if COMPILER2_OR_JVMCI 255 base_addr = XSAVE_AREA_OPMASK_BEGIN; 256 off = 0; 257 for(int n = 0; n < KRegister::number_of_registers; n++) { 258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 259 } 260 #endif 261 } 262 } 263 264 #if COMPILER2_OR_JVMCI 265 if (UseAPX) { 266 int base_addr = XSAVE_AREA_EGPRS; 267 off = 0; 268 for(int n = 16; n < Register::number_of_registers; n++) { 269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 270 } 271 } 272 #endif 273 274 __ vzeroupper(); 275 if (frame::arg_reg_save_area_bytes != 0) { 276 // Allocate argument register save area 277 __ subptr(rsp, frame::arg_reg_save_area_bytes); 278 } 279 280 // Set an oopmap for the call site. This oopmap will map all 281 // oop-registers and debug-info registers as callee-saved. This 282 // will allow deoptimization at this safepoint to find all possible 283 // debug-info recordings, as well as let GC find all oops. 
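  // For illustration only: each set_callee_saved() call below records "register R
  // was spilled to compiler stack slot S of this frame", so a stack walker can
  // locate (and, for an oop held in R, update) the saved value at roughly
  //   saved_value_addr = frame_sp + S * VMRegImpl::stack_slot_size
  // (hypothetical pseudo-code for orientation, not an actual API used in this file).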
284 285 OopMapSet *oop_maps = new OopMapSet(); 286 OopMap* map = new OopMap(frame_size_in_slots, 0); 287 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 289 290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 294 // rbp location is known implicitly by the frame sender code, needs no oopmap 295 // and the location where rbp was saved by is ignored 296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 306 307 if (UseAPX) { 308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 324 } 325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 326 // on EVEX enabled targets, we get it included in the xsave area 327 off = xmm0_off; 328 int delta = xmm1_off - off; 329 for (int n = 0; n < 16; n++) { 330 XMMRegister xmm_name = as_XMMRegister(n); 331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 332 off += delta; 333 } 334 if (UseAVX > 2) { 335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 336 off = zmm16_off; 337 delta = zmm17_off - off; 338 for (int n = 16; n < num_xmm_regs; n++) { 339 XMMRegister zmm_name = as_XMMRegister(n); 340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 341 off += delta; 342 } 343 } 344 345 #if COMPILER2_OR_JVMCI 346 if (save_wide_vectors) { 347 // Save upper half of YMM registers(0..15) 348 off = ymm0_off; 349 delta = ymm1_off - ymm0_off; 350 for (int n = 0; n < 16; n++) { 351 XMMRegister ymm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 353 off += delta; 354 } 355 if (VM_Version::supports_evex()) { 356 // Save upper half of ZMM registers(0..15) 357 off = zmm0_off; 
358 delta = zmm1_off - zmm0_off; 359 for (int n = 0; n < 16; n++) { 360 XMMRegister zmm_name = as_XMMRegister(n); 361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 362 off += delta; 363 } 364 } 365 } 366 #endif // COMPILER2_OR_JVMCI 367 368 // %%% These should all be a waste but we'll keep things as they were for now 369 if (true) { 370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 374 // rbp location is known implicitly by the frame sender code, needs no oopmap 375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 385 if (UseAPX) { 386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 402 } 403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 404 // on EVEX enabled targets, we get it included in the xsave area 405 off = xmm0H_off; 406 delta = xmm1H_off - off; 407 for (int n = 0; n < 16; n++) { 408 XMMRegister xmm_name = as_XMMRegister(n); 409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 410 off += delta; 411 } 412 if (UseAVX > 2) { 413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 414 off = zmm16H_off; 415 delta = zmm17H_off - off; 416 for (int n = 16; n < num_xmm_regs; n++) { 417 XMMRegister zmm_name = as_XMMRegister(n); 418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 419 off += delta; 420 } 421 } 422 } 423 424 return map; 
425 } 426 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 428 int num_xmm_regs = XMMRegister::available_xmm_registers(); 429 if (frame::arg_reg_save_area_bytes != 0) { 430 // Pop arg register save area 431 __ addptr(rsp, frame::arg_reg_save_area_bytes); 432 } 433 434 #if COMPILER2_OR_JVMCI 435 if (restore_wide_vectors) { 436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 438 } 439 #else 440 assert(!restore_wide_vectors, "vectors are generated only by C2"); 441 #endif 442 443 __ vzeroupper(); 444 445 // On EVEX enabled targets everything is handled in pop fpu state 446 if (restore_wide_vectors) { 447 // Restore upper half of YMM registers (0..15) 448 int base_addr = XSAVE_AREA_YMM_BEGIN; 449 for (int n = 0; n < 16; n++) { 450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 451 } 452 if (VM_Version::supports_evex()) { 453 // Restore upper half of ZMM registers (0..15) 454 base_addr = XSAVE_AREA_ZMM_BEGIN; 455 for (int n = 0; n < 16; n++) { 456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 457 } 458 // Restore full ZMM registers(16..num_xmm_regs) 459 base_addr = XSAVE_AREA_UPPERBANK; 460 int vector_len = Assembler::AVX_512bit; 461 int off = 0; 462 for (int n = 16; n < num_xmm_regs; n++) { 463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 464 } 465 #if COMPILER2_OR_JVMCI 466 base_addr = XSAVE_AREA_OPMASK_BEGIN; 467 off = 0; 468 for (int n = 0; n < KRegister::number_of_registers; n++) { 469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 470 } 471 #endif 472 } 473 } else { 474 if (VM_Version::supports_evex()) { 475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 476 int base_addr = XSAVE_AREA_UPPERBANK; 477 int off = 0; 478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 479 for (int n = 16; n < num_xmm_regs; n++) { 480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 481 } 482 #if COMPILER2_OR_JVMCI 483 base_addr = XSAVE_AREA_OPMASK_BEGIN; 484 off = 0; 485 for (int n = 0; n < KRegister::number_of_registers; n++) { 486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 487 } 488 #endif 489 } 490 } 491 492 #if COMPILER2_OR_JVMCI 493 if (UseAPX) { 494 int base_addr = XSAVE_AREA_EGPRS; 495 int off = 0; 496 for (int n = 16; n < Register::number_of_registers; n++) { 497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 498 } 499 } 500 #endif 501 502 // Recover CPU state 503 __ pop_FPU_state(); 504 __ restore_legacy_gprs(); 505 __ addq(rsp, 8); 506 __ popf(); 507 // Get the rbp described implicitly by the calling convention (no oopMap) 508 __ pop(rbp); 509 } 510 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 512 513 // Just restore result register. Only used by deoptimization. By 514 // now any callee save register that needs to be restored to a c2 515 // caller of the deoptee has been extracted into the vframeArray 516 // and will be stuffed into the c2i adapter we create for later 517 // restoration so only result registers need to be restored here. 
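  // For example: a Java callee returning 'long' leaves its result in rax and one
  // returning 'double' leaves it in xmm0, so reloading rax/rdx and xmm0 below and
  // then popping everything up to return_offset_in_bytes() (which leaves only the
  // return address on the stack) is all the deoptimization path needs here.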

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Register values up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
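  // Worked example (illustrative only): for a signature (int, long, double) the
  // loop below sees sig_bt = { T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID } and picks
  //   T_INT    -> j_rarg0   (set1, int_args becomes 1)
  //   T_LONG   -> j_rarg1   (set2, int_args becomes 2)
  //   T_VOID   -> BAD       (half of the long)
  //   T_DOUBLE -> j_farg0   (set2, fp_args becomes 1)
  //   T_VOID   -> BAD       (half of the double)
  // and returns stk_args == 0 since nothing spilled to the stack.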
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 564 }; 565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 566 j_farg0, j_farg1, j_farg2, j_farg3, 567 j_farg4, j_farg5, j_farg6, j_farg7 568 }; 569 570 571 uint int_args = 0; 572 uint fp_args = 0; 573 uint stk_args = 0; 574 575 for (int i = 0; i < total_args_passed; i++) { 576 switch (sig_bt[i]) { 577 case T_BOOLEAN: 578 case T_CHAR: 579 case T_BYTE: 580 case T_SHORT: 581 case T_INT: 582 if (int_args < Argument::n_int_register_parameters_j) { 583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 584 } else { 585 stk_args = align_up(stk_args, 2); 586 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 587 stk_args += 1; 588 } 589 break; 590 case T_VOID: 591 // halves of T_LONG or T_DOUBLE 592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 593 regs[i].set_bad(); 594 break; 595 case T_LONG: 596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 597 // fall through 598 case T_OBJECT: 599 case T_ARRAY: 600 case T_ADDRESS: 601 if (int_args < Argument::n_int_register_parameters_j) { 602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 603 } else { 604 stk_args = align_up(stk_args, 2); 605 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 606 stk_args += 2; 607 } 608 break; 609 case T_FLOAT: 610 if (fp_args < Argument::n_float_register_parameters_j) { 611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 612 } else { 613 stk_args = align_up(stk_args, 2); 614 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 615 stk_args += 1; 616 } 617 break; 618 case T_DOUBLE: 619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 620 if (fp_args < Argument::n_float_register_parameters_j) { 621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 622 } else { 623 stk_args = align_up(stk_args, 2); 624 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 625 stk_args += 2; 626 } 627 break; 628 default: 629 ShouldNotReachHere(); 630 break; 631 } 632 } 633 634 return stk_args; 635 } 636 637 // Patch the callers callsite with entry to compiled code if it exists. 638 static void patch_callers_callsite(MacroAssembler *masm) { 639 Label L; 640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 641 __ jcc(Assembler::equal, L); 642 643 // Save the current stack pointer 644 __ mov(r13, rsp); 645 // Schedule the branch target address early. 
646 // Call into the VM to patch the caller, then jump to compiled callee 647 // rax isn't live so capture return address while we easily can 648 __ movptr(rax, Address(rsp, 0)); 649 650 // align stack so push_CPU_state doesn't fault 651 __ andptr(rsp, -(StackAlignmentInBytes)); 652 __ push_CPU_state(); 653 __ vzeroupper(); 654 // VM needs caller's callsite 655 // VM needs target method 656 // This needs to be a long call since we will relocate this adapter to 657 // the codeBuffer and it may not reach 658 659 // Allocate argument register save area 660 if (frame::arg_reg_save_area_bytes != 0) { 661 __ subptr(rsp, frame::arg_reg_save_area_bytes); 662 } 663 __ mov(c_rarg0, rbx); 664 __ mov(c_rarg1, rax); 665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 666 667 // De-allocate argument register save area 668 if (frame::arg_reg_save_area_bytes != 0) { 669 __ addptr(rsp, frame::arg_reg_save_area_bytes); 670 } 671 672 __ vzeroupper(); 673 __ pop_CPU_state(); 674 // restore sp 675 __ mov(rsp, r13); 676 __ bind(L); 677 } 678 679 680 static void gen_c2i_adapter(MacroAssembler *masm, 681 int total_args_passed, 682 int comp_args_on_stack, 683 const BasicType *sig_bt, 684 const VMRegPair *regs, 685 Label& skip_fixup) { 686 // Before we get into the guts of the C2I adapter, see if we should be here 687 // at all. We've come from compiled code and are attempting to jump to the 688 // interpreter, which means the caller made a static call to get here 689 // (vcalls always get a compiled target if there is one). Check for a 690 // compiled target. If there is one, we need to patch the caller's call. 691 patch_callers_callsite(masm); 692 693 __ bind(skip_fixup); 694 695 // Since all args are passed on the stack, total_args_passed * 696 // Interpreter::stackElementSize is the space we need. 697 698 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 699 700 int extraspace = (total_args_passed * Interpreter::stackElementSize); 701 702 // stack is aligned, keep it that way 703 // This is not currently needed or enforced by the interpreter, but 704 // we might as well conform to the ABI. 705 extraspace = align_up(extraspace, 2*wordSize); 706 707 // set senderSP value 708 __ lea(r13, Address(rsp, wordSize)); 709 710 #ifdef ASSERT 711 __ check_stack_alignment(r13, "sender stack not aligned"); 712 #endif 713 if (extraspace > 0) { 714 // Pop the return address 715 __ pop(rax); 716 717 __ subptr(rsp, extraspace); 718 719 // Push the return address 720 __ push(rax); 721 722 // Account for the return address location since we store it first rather 723 // than hold it in a register across all the shuffling 724 extraspace += wordSize; 725 } 726 727 #ifdef ASSERT 728 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 729 #endif 730 731 // Now write the args into the outgoing interpreter space 732 for (int i = 0; i < total_args_passed; i++) { 733 if (sig_bt[i] == T_VOID) { 734 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 735 continue; 736 } 737 738 // offset to start parameters 739 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 740 int next_off = st_off - Interpreter::stackElementSize; 741 742 // Say 4 args: 743 // i st_off 744 // 0 32 T_LONG 745 // 1 24 T_VOID 746 // 2 16 T_OBJECT 747 // 3 8 T_BOOL 748 // - 0 return address 749 // 750 // However to make thing extra confusing. 
Because we can fit a long/double in 751 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 752 // leaves one slot empty and only stores to a single slot. In this case the 753 // slot that is occupied is the T_VOID slot. See I said it was confusing. 754 755 VMReg r_1 = regs[i].first(); 756 VMReg r_2 = regs[i].second(); 757 if (!r_1->is_valid()) { 758 assert(!r_2->is_valid(), ""); 759 continue; 760 } 761 if (r_1->is_stack()) { 762 // memory to memory use rax 763 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 764 if (!r_2->is_valid()) { 765 // sign extend?? 766 __ movl(rax, Address(rsp, ld_off)); 767 __ movptr(Address(rsp, st_off), rax); 768 769 } else { 770 771 __ movq(rax, Address(rsp, ld_off)); 772 773 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 774 // T_DOUBLE and T_LONG use two slots in the interpreter 775 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 776 // ld_off == LSW, ld_off+wordSize == MSW 777 // st_off == MSW, next_off == LSW 778 __ movq(Address(rsp, next_off), rax); 779 #ifdef ASSERT 780 // Overwrite the unused slot with known junk 781 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 782 __ movptr(Address(rsp, st_off), rax); 783 #endif /* ASSERT */ 784 } else { 785 __ movq(Address(rsp, st_off), rax); 786 } 787 } 788 } else if (r_1->is_Register()) { 789 Register r = r_1->as_Register(); 790 if (!r_2->is_valid()) { 791 // must be only an int (or less ) so move only 32bits to slot 792 // why not sign extend?? 793 __ movl(Address(rsp, st_off), r); 794 } else { 795 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 796 // T_DOUBLE and T_LONG use two slots in the interpreter 797 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 798 // long/double in gpr 799 #ifdef ASSERT 800 // Overwrite the unused slot with known junk 801 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 802 __ movptr(Address(rsp, st_off), rax); 803 #endif /* ASSERT */ 804 __ movq(Address(rsp, next_off), r); 805 } else { 806 __ movptr(Address(rsp, st_off), r); 807 } 808 } 809 } else { 810 assert(r_1->is_XMMRegister(), ""); 811 if (!r_2->is_valid()) { 812 // only a float use just part of the slot 813 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 814 } else { 815 #ifdef ASSERT 816 // Overwrite the unused slot with known junk 817 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 818 __ movptr(Address(rsp, st_off), rax); 819 #endif /* ASSERT */ 820 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 821 } 822 } 823 } 824 825 // Schedule the branch target address early. 826 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 827 __ jmp(rcx); 828 } 829 830 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, 831 address code_start, address code_end, 832 Label& L_ok) { 833 Label L_fail; 834 __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none)); 835 __ cmpptr(pc_reg, temp_reg); 836 __ jcc(Assembler::belowEqual, L_fail); 837 __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none)); 838 __ cmpptr(pc_reg, temp_reg); 839 __ jcc(Assembler::below, L_ok); 840 __ bind(L_fail); 841 } 842 843 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 844 int total_args_passed, 845 int comp_args_on_stack, 846 const BasicType *sig_bt, 847 const VMRegPair *regs) { 848 849 // Note: r13 contains the senderSP on entry. We must preserve it since 850 // we may do a i2c -> c2i transition if we lose a race where compiled 851 // code goes non-entrant while we get args ready. 
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment expected by all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    // assert(Interpreter::contains($return_addr) ||
    //        StubRoutines::contains($return_addr),
    //        "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
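  // For example (illustrative arithmetic): with comp_args_on_stack == 5 the compiled
  // callee needs 5 * VMRegImpl::stack_slot_size == 20 bytes of outgoing args;
  // align_up(20, wordSize) == 24 and 24 >> LogBytesPerWord == 3 words reserved below.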
918 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 919 920 if (comp_args_on_stack) { 921 __ subptr(rsp, comp_words_on_stack * wordSize); 922 } 923 924 // Ensure compiled code always sees stack at proper alignment 925 __ andptr(rsp, -16); 926 927 // push the return address and misalign the stack that youngest frame always sees 928 // as far as the placement of the call instruction 929 __ push(rax); 930 931 // Put saved SP in another register 932 const Register saved_sp = rax; 933 __ movptr(saved_sp, r11); 934 935 // Will jump to the compiled code just as if compiled code was doing it. 936 // Pre-load the register-jump target early, to schedule it better. 937 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 938 939 #if INCLUDE_JVMCI 940 if (EnableJVMCI) { 941 // check if this call should be routed towards a specific entry point 942 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 943 Label no_alternative_target; 944 __ jcc(Assembler::equal, no_alternative_target); 945 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 946 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 947 __ bind(no_alternative_target); 948 } 949 #endif // INCLUDE_JVMCI 950 951 // Now generate the shuffle code. Pick up all register args and move the 952 // rest through the floating point stack top. 953 for (int i = 0; i < total_args_passed; i++) { 954 if (sig_bt[i] == T_VOID) { 955 // Longs and doubles are passed in native word order, but misaligned 956 // in the 32-bit build. 957 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 958 continue; 959 } 960 961 // Pick up 0, 1 or 2 words from SP+offset. 962 963 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 964 "scrambled load targets?"); 965 // Load in argument order going down. 966 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 967 // Point to interpreter value (vs. tag) 968 int next_off = ld_off - Interpreter::stackElementSize; 969 // 970 // 971 // 972 VMReg r_1 = regs[i].first(); 973 VMReg r_2 = regs[i].second(); 974 if (!r_1->is_valid()) { 975 assert(!r_2->is_valid(), ""); 976 continue; 977 } 978 if (r_1->is_stack()) { 979 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 980 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 981 982 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 983 // and if we end up going thru a c2i because of a miss a reasonable value of r13 984 // will be generated. 985 if (!r_2->is_valid()) { 986 // sign extend??? 987 __ movl(r13, Address(saved_sp, ld_off)); 988 __ movptr(Address(rsp, st_off), r13); 989 } else { 990 // 991 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 992 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 993 // So we must adjust where to pick up the data to match the interpreter. 994 // 995 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 996 // are accessed as negative so LSW is at LOW address 997 998 // ld_off is MSW so get LSW 999 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1000 next_off : ld_off; 1001 __ movq(r13, Address(saved_sp, offset)); 1002 // st_off is LSW (i.e. 
reg.first()) 1003 __ movq(Address(rsp, st_off), r13); 1004 } 1005 } else if (r_1->is_Register()) { // Register argument 1006 Register r = r_1->as_Register(); 1007 assert(r != rax, "must be different"); 1008 if (r_2->is_valid()) { 1009 // 1010 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1011 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1012 // So we must adjust where to pick up the data to match the interpreter. 1013 1014 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1015 next_off : ld_off; 1016 1017 // this can be a misaligned move 1018 __ movq(r, Address(saved_sp, offset)); 1019 } else { 1020 // sign extend and use a full word? 1021 __ movl(r, Address(saved_sp, ld_off)); 1022 } 1023 } else { 1024 if (!r_2->is_valid()) { 1025 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 1026 } else { 1027 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 1028 } 1029 } 1030 } 1031 1032 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 1033 1034 // 6243940 We might end up in handle_wrong_method if 1035 // the callee is deoptimized as we race thru here. If that 1036 // happens we don't want to take a safepoint because the 1037 // caller frame will look interpreted and arguments are now 1038 // "compiled" so it is much better to make this transition 1039 // invisible to the stack walking code. Unfortunately if 1040 // we try and find the callee by normal means a safepoint 1041 // is possible. So we stash the desired callee in the thread 1042 // and the vm will find there should this case occur. 1043 1044 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 1045 1046 // put Method* where a c2i would expect should we end up there 1047 // only needed because eof c2 resolve stubs return Method* as a result in 1048 // rax 1049 __ mov(rax, rbx); 1050 __ jmp(r11); 1051 } 1052 1053 // --------------------------------------------------------------- 1054 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 1055 int total_args_passed, 1056 int comp_args_on_stack, 1057 const BasicType *sig_bt, 1058 const VMRegPair *regs, 1059 AdapterFingerPrint* fingerprint) { 1060 address i2c_entry = __ pc(); 1061 1062 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 1063 1064 // ------------------------------------------------------------------------- 1065 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1066 // to the interpreter. The args start out packed in the compiled layout. They 1067 // need to be unpacked into the interpreter layout. This will almost always 1068 // require some stack space. We grow the current (compiled) stack, then repack 1069 // the args. We finally end in a jump to the generic interpreter entry point. 1070 // On exit from the interpreter, the interpreter will restore our SP (lest the 1071 // compiled code, which relies solely on SP and not RBP, get sick). 
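  // Roadmap (a summary of the code below; all four entries are handed to
  // AdapterHandlerLibrary::new_entry() at the end):
  //   i2c_entry                 - interpreted caller -> compiled callee (generated above)
  //   c2i_unverified_entry      - compiled caller, inline cache not yet verified
  //   c2i_entry                 - compiled caller -> interpreted callee
  //   c2i_no_clinit_check_entry - c2i entry that bypasses the class initialization barrier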
1072 1073 address c2i_unverified_entry = __ pc(); 1074 Label skip_fixup; 1075 1076 Register data = rax; 1077 Register receiver = j_rarg0; 1078 Register temp = rbx; 1079 1080 { 1081 __ ic_check(1 /* end_alignment */); 1082 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1083 // Method might have been compiled since the call site was patched to 1084 // interpreted if that is the case treat it as a miss so we can get 1085 // the call site corrected. 1086 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1087 __ jcc(Assembler::equal, skip_fixup); 1088 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1089 } 1090 1091 address c2i_entry = __ pc(); 1092 1093 // Class initialization barrier for static methods 1094 address c2i_no_clinit_check_entry = nullptr; 1095 if (VM_Version::supports_fast_class_init_checks()) { 1096 Label L_skip_barrier; 1097 Register method = rbx; 1098 1099 { // Bypass the barrier for non-static methods 1100 Register flags = rscratch1; 1101 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset())); 1102 __ testl(flags, JVM_ACC_STATIC); 1103 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1104 } 1105 1106 Register klass = rscratch1; 1107 __ load_method_holder(klass, method); 1108 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1109 1110 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1111 1112 __ bind(L_skip_barrier); 1113 c2i_no_clinit_check_entry = __ pc(); 1114 } 1115 1116 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1117 bs->c2i_entry_barrier(masm); 1118 1119 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1120 1121 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1122 } 1123 1124 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1125 VMRegPair *regs, 1126 int total_args_passed) { 1127 1128 // We return the amount of VMRegImpl stack slots we need to reserve for all 1129 // the arguments NOT counting out_preserve_stack_slots. 1130 1131 // NOTE: These arrays will have to change when c1 is ported 1132 #ifdef _WIN64 1133 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1134 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1135 }; 1136 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1137 c_farg0, c_farg1, c_farg2, c_farg3 1138 }; 1139 #else 1140 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1141 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1142 }; 1143 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1144 c_farg0, c_farg1, c_farg2, c_farg3, 1145 c_farg4, c_farg5, c_farg6, c_farg7 1146 }; 1147 #endif // _WIN64 1148 1149 1150 uint int_args = 0; 1151 uint fp_args = 0; 1152 uint stk_args = 0; // inc by 2 each time 1153 1154 for (int i = 0; i < total_args_passed; i++) { 1155 switch (sig_bt[i]) { 1156 case T_BOOLEAN: 1157 case T_CHAR: 1158 case T_BYTE: 1159 case T_SHORT: 1160 case T_INT: 1161 if (int_args < Argument::n_int_register_parameters_c) { 1162 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1163 #ifdef _WIN64 1164 fp_args++; 1165 // Allocate slots for callee to stuff register args the stack. 
1166 stk_args += 2; 1167 #endif 1168 } else { 1169 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1170 stk_args += 2; 1171 } 1172 break; 1173 case T_LONG: 1174 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1175 // fall through 1176 case T_OBJECT: 1177 case T_ARRAY: 1178 case T_ADDRESS: 1179 case T_METADATA: 1180 if (int_args < Argument::n_int_register_parameters_c) { 1181 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1182 #ifdef _WIN64 1183 fp_args++; 1184 stk_args += 2; 1185 #endif 1186 } else { 1187 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1188 stk_args += 2; 1189 } 1190 break; 1191 case T_FLOAT: 1192 if (fp_args < Argument::n_float_register_parameters_c) { 1193 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1194 #ifdef _WIN64 1195 int_args++; 1196 // Allocate slots for callee to stuff register args the stack. 1197 stk_args += 2; 1198 #endif 1199 } else { 1200 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1201 stk_args += 2; 1202 } 1203 break; 1204 case T_DOUBLE: 1205 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1206 if (fp_args < Argument::n_float_register_parameters_c) { 1207 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1208 #ifdef _WIN64 1209 int_args++; 1210 // Allocate slots for callee to stuff register args the stack. 1211 stk_args += 2; 1212 #endif 1213 } else { 1214 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1215 stk_args += 2; 1216 } 1217 break; 1218 case T_VOID: // Halves of longs and doubles 1219 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1220 regs[i].set_bad(); 1221 break; 1222 default: 1223 ShouldNotReachHere(); 1224 break; 1225 } 1226 } 1227 #ifdef _WIN64 1228 // windows abi requires that we always allocate enough stack space 1229 // for 4 64bit registers to be stored down. 1230 if (stk_args < 8) { 1231 stk_args = 8; 1232 } 1233 #endif // _WIN64 1234 1235 return stk_args; 1236 } 1237 1238 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1239 uint num_bits, 1240 uint total_args_passed) { 1241 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1242 "only certain vector sizes are supported for now"); 1243 1244 static const XMMRegister VEC_ArgReg[32] = { 1245 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1246 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1247 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1248 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1249 }; 1250 1251 uint stk_args = 0; 1252 uint fp_args = 0; 1253 1254 for (uint i = 0; i < total_args_passed; i++) { 1255 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1256 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 
7 : 15)); 1257 regs[i].set_pair(vmreg->next(next_val), vmreg); 1258 } 1259 1260 return stk_args; 1261 } 1262 1263 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1264 // We always ignore the frame_slots arg and just use the space just below frame pointer 1265 // which by this time is free to use 1266 switch (ret_type) { 1267 case T_FLOAT: 1268 __ movflt(Address(rbp, -wordSize), xmm0); 1269 break; 1270 case T_DOUBLE: 1271 __ movdbl(Address(rbp, -wordSize), xmm0); 1272 break; 1273 case T_VOID: break; 1274 default: { 1275 __ movptr(Address(rbp, -wordSize), rax); 1276 } 1277 } 1278 } 1279 1280 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1281 // We always ignore the frame_slots arg and just use the space just below frame pointer 1282 // which by this time is free to use 1283 switch (ret_type) { 1284 case T_FLOAT: 1285 __ movflt(xmm0, Address(rbp, -wordSize)); 1286 break; 1287 case T_DOUBLE: 1288 __ movdbl(xmm0, Address(rbp, -wordSize)); 1289 break; 1290 case T_VOID: break; 1291 default: { 1292 __ movptr(rax, Address(rbp, -wordSize)); 1293 } 1294 } 1295 } 1296 1297 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1298 for ( int i = first_arg ; i < arg_count ; i++ ) { 1299 if (args[i].first()->is_Register()) { 1300 __ push(args[i].first()->as_Register()); 1301 } else if (args[i].first()->is_XMMRegister()) { 1302 __ subptr(rsp, 2*wordSize); 1303 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1304 } 1305 } 1306 } 1307 1308 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1309 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1310 if (args[i].first()->is_Register()) { 1311 __ pop(args[i].first()->as_Register()); 1312 } else if (args[i].first()->is_XMMRegister()) { 1313 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1314 __ addptr(rsp, 2*wordSize); 1315 } 1316 } 1317 } 1318 1319 static void verify_oop_args(MacroAssembler* masm, 1320 const methodHandle& method, 1321 const BasicType* sig_bt, 1322 const VMRegPair* regs) { 1323 Register temp_reg = rbx; // not part of any compiled calling seq 1324 if (VerifyOops) { 1325 for (int i = 0; i < method->size_of_parameters(); i++) { 1326 if (is_reference_type(sig_bt[i])) { 1327 VMReg r = regs[i].first(); 1328 assert(r->is_valid(), "bad oop arg"); 1329 if (r->is_stack()) { 1330 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1331 __ verify_oop(temp_reg); 1332 } else { 1333 __ verify_oop(r->as_Register()); 1334 } 1335 } 1336 } 1337 } 1338 } 1339 1340 static void check_continuation_enter_argument(VMReg actual_vmreg, 1341 Register expected_reg, 1342 const char* name) { 1343 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1344 assert(actual_vmreg->as_Register() == expected_reg, 1345 "%s is in unexpected register: %s instead of %s", 1346 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1347 } 1348 1349 1350 //---------------------------- continuation_enter_setup --------------------------- 1351 // 1352 // Arguments: 1353 // None. 
1354 // 1355 // Results: 1356 // rsp: pointer to blank ContinuationEntry 1357 // 1358 // Kills: 1359 // rax 1360 // 1361 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1362 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1363 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1364 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1365 1366 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1367 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1368 1369 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1370 OopMap* map = new OopMap(frame_size, 0); 1371 1372 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1373 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1374 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1375 1376 return map; 1377 } 1378 1379 //---------------------------- fill_continuation_entry --------------------------- 1380 // 1381 // Arguments: 1382 // rsp: pointer to blank Continuation entry 1383 // reg_cont_obj: pointer to the continuation 1384 // reg_flags: flags 1385 // 1386 // Results: 1387 // rsp: pointer to filled out ContinuationEntry 1388 // 1389 // Kills: 1390 // rax 1391 // 1392 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1393 assert_different_registers(rax, reg_cont_obj, reg_flags); 1394 #ifdef ASSERT 1395 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1396 #endif 1397 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1398 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1399 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1400 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1401 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1402 1403 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1404 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1405 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1406 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1407 1408 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1409 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1410 } 1411 1412 //---------------------------- continuation_enter_cleanup --------------------------- 1413 // 1414 // Arguments: 1415 // rsp: pointer to the ContinuationEntry 1416 // 1417 // Results: 1418 // rsp: pointer to the spilled rbp in the entry frame 1419 // 1420 // Kills: 1421 // rbx 1422 // 1423 static void continuation_enter_cleanup(MacroAssembler* masm) { 1424 #ifdef ASSERT 1425 Label L_good_sp; 1426 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1427 __ jcc(Assembler::equal, L_good_sp); 1428 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1429 __ bind(L_good_sp); 1430 #endif 1431 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1432 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1433 1434 if (CheckJNICalls) { 1435 // Check if this is a virtual thread continuation 1436 Label L_skip_vthread_code; 1437 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1438 __ 
jcc(Assembler::equal, L_skip_vthread_code); 1439 1440 // If the held monitor count is > 0 and this vthread is terminating then 1441 // it failed to release a JNI monitor. So we issue the same log message 1442 // that JavaThread::exit does. 1443 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1444 __ jcc(Assembler::equal, L_skip_vthread_code); 1445 1446 // rax may hold an exception oop, save it before the call 1447 __ push(rax); 1448 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1449 __ pop(rax); 1450 1451 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1452 // on termination. The held count is implicitly zeroed below when we restore from 1453 // the parent held count (which has to be zero). 1454 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1455 1456 __ bind(L_skip_vthread_code); 1457 } 1458 #ifdef ASSERT 1459 else { 1460 // Check if this is a virtual thread continuation 1461 Label L_skip_vthread_code; 1462 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1463 __ jcc(Assembler::equal, L_skip_vthread_code); 1464 1465 // See comment just above. If not checking JNI calls the JNI count is only 1466 // needed for assertion checking. 1467 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1468 1469 __ bind(L_skip_vthread_code); 1470 } 1471 #endif 1472 1473 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1474 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1475 1476 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1477 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1478 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1479 } 1480 1481 static void gen_continuation_enter(MacroAssembler* masm, 1482 const VMRegPair* regs, 1483 int& exception_offset, 1484 OopMapSet* oop_maps, 1485 int& frame_complete, 1486 int& stack_slots, 1487 int& interpreted_entry_offset, 1488 int& compiled_entry_offset) { 1489 1490 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1491 int pos_cont_obj = 0; 1492 int pos_is_cont = 1; 1493 int pos_is_virtual = 2; 1494 1495 // The platform-specific calling convention may present the arguments in various registers. 1496 // To simplify the rest of the code, we expect the arguments to reside at these known 1497 // registers, and we additionally check the placement here in case calling convention ever 1498 // changes. 
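  // For reference (assuming the usual x86_64 register aliases): the Java calling
  // convention is the C one shifted by one register, so j_rarg0 == c_rarg1,
  // j_rarg1 == c_rarg2 and j_rarg2 == c_rarg3, which is why enterSpecial's three
  // arguments are expected in c_rarg1..c_rarg3 below.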
1499 Register reg_cont_obj = c_rarg1; 1500 Register reg_is_cont = c_rarg2; 1501 Register reg_is_virtual = c_rarg3; 1502 1503 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1504 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1505 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1506 1507 // Utility methods kill rax, make sure there are no collisions 1508 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1509 1510 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1511 relocInfo::static_call_type); 1512 1513 address start = __ pc(); 1514 1515 Label L_thaw, L_exit; 1516 1517 // i2i entry used at interp_only_mode only 1518 interpreted_entry_offset = __ pc() - start; 1519 { 1520 #ifdef ASSERT 1521 Label is_interp_only; 1522 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1523 __ jcc(Assembler::notEqual, is_interp_only); 1524 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1525 __ bind(is_interp_only); 1526 #endif 1527 1528 __ pop(rax); // return address 1529 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1530 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1531 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1532 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1533 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1534 __ push(rax); // return address 1535 __ push_cont_fastpath(); 1536 1537 __ enter(); 1538 1539 stack_slots = 2; // will be adjusted in setup 1540 OopMap* map = continuation_enter_setup(masm, stack_slots); 1541 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1542 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1543 1544 __ verify_oop(reg_cont_obj); 1545 1546 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1547 1548 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1549 __ testptr(reg_is_cont, reg_is_cont); 1550 __ jcc(Assembler::notZero, L_thaw); 1551 1552 // --- Resolve path 1553 1554 // Make sure the call is patchable 1555 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1556 // Emit stub for static call 1557 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1558 if (stub == nullptr) { 1559 fatal("CodeCache is full at gen_continuation_enter"); 1560 } 1561 __ call(resolve); 1562 oop_maps->add_gc_map(__ pc() - start, map); 1563 __ post_call_nop(); 1564 1565 __ jmp(L_exit); 1566 } 1567 1568 // compiled entry 1569 __ align(CodeEntryAlignment); 1570 compiled_entry_offset = __ pc() - start; 1571 __ enter(); 1572 1573 stack_slots = 2; // will be adjusted in setup 1574 OopMap* map = continuation_enter_setup(masm, stack_slots); 1575 1576 // Frame is now completed as far as size and linkage. 1577 frame_complete = __ pc() - start; 1578 1579 __ verify_oop(reg_cont_obj); 1580 1581 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1582 1583 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1584 __ testptr(reg_is_cont, reg_is_cont); 1585 __ jccb(Assembler::notZero, L_thaw); 1586 1587 // --- call Continuation.enter(Continuation c, boolean isContinue) 1588 1589 // Make sure the call is patchable 1590 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1591 1592 // Emit stub for static call 1593 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1594 if (stub == nullptr) { 1595 fatal("CodeCache is full at gen_continuation_enter"); 1596 } 1597 1598 // The call needs to be resolved. There's a special case for this in 1599 // SharedRuntime::find_callee_info_helper() which calls 1600 // LinkResolver::resolve_continuation_enter() which resolves the call to 1601 // Continuation.enter(Continuation c, boolean isContinue). 1602 __ call(resolve); 1603 1604 oop_maps->add_gc_map(__ pc() - start, map); 1605 __ post_call_nop(); 1606 1607 __ jmpb(L_exit); 1608 1609 // --- Thawing path 1610 1611 __ bind(L_thaw); 1612 1613 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1614 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1615 1616 ContinuationEntry::_return_pc_offset = __ pc() - start; 1617 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1618 __ post_call_nop(); 1619 1620 // --- Normal exit (resolve/thawing) 1621 1622 __ bind(L_exit); 1623 ContinuationEntry::_cleanup_offset = __ pc() - start; 1624 continuation_enter_cleanup(masm); 1625 __ pop(rbp); 1626 __ ret(0); 1627 1628 // --- Exception handling path 1629 1630 exception_offset = __ pc() - start; 1631 1632 continuation_enter_cleanup(masm); 1633 __ pop(rbp); 1634 1635 __ movptr(c_rarg0, r15_thread); 1636 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1637 1638 // rax still holds the original exception oop, save it before the call 1639 __ push(rax); 1640 1641 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1642 __ movptr(rbx, rax); 1643 1644 // Continue at exception handler: 1645 // rax: exception oop 1646 // rbx: exception handler 1647 // rdx: exception pc 1648 __ pop(rax); 1649 __ verify_oop(rax); 1650 __ pop(rdx); 1651 __ jmp(rbx); 1652 } 1653 1654 static void gen_continuation_yield(MacroAssembler* masm, 1655 const VMRegPair* regs, 1656 OopMapSet* oop_maps, 1657 int& frame_complete, 1658 int& stack_slots, 1659 int& compiled_entry_offset) { 1660 enum layout { 1661 rbp_off, 1662 rbpH_off, 1663 return_off, 1664 return_off2, 1665 framesize // inclusive of return address 1666 }; 1667 stack_slots = framesize / VMRegImpl::slots_per_word; 1668 assert(stack_slots == 2, "recheck layout"); 1669 1670 address start = __ pc(); 1671 compiled_entry_offset = __ pc() - start; 1672 __ enter(); 1673 address the_pc = __ pc(); 1674 1675 frame_complete = the_pc - start; 1676 1677 // This nop must be exactly at the PC we push into the frame info. 1678 // We use this nop for fast CodeBlob lookup, associate the OopMap 1679 // with it right away. 
1680 __ post_call_nop(); 1681 OopMap* map = new OopMap(framesize, 1); 1682 oop_maps->add_gc_map(frame_complete, map); 1683 1684 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1685 __ movptr(c_rarg0, r15_thread); 1686 __ movptr(c_rarg1, rsp); 1687 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1688 __ reset_last_Java_frame(true); 1689 1690 Label L_pinned; 1691 1692 __ testptr(rax, rax); 1693 __ jcc(Assembler::notZero, L_pinned); 1694 1695 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1696 continuation_enter_cleanup(masm); 1697 __ pop(rbp); 1698 __ ret(0); 1699 1700 __ bind(L_pinned); 1701 1702 // Pinned, return to caller 1703 1704 // handle pending exception thrown by freeze 1705 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1706 Label ok; 1707 __ jcc(Assembler::equal, ok); 1708 __ leave(); 1709 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1710 __ bind(ok); 1711 1712 __ leave(); 1713 __ ret(0); 1714 } 1715 1716 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1717 ::continuation_enter_cleanup(masm); 1718 } 1719 1720 static void gen_special_dispatch(MacroAssembler* masm, 1721 const methodHandle& method, 1722 const BasicType* sig_bt, 1723 const VMRegPair* regs) { 1724 verify_oop_args(masm, method, sig_bt, regs); 1725 vmIntrinsics::ID iid = method->intrinsic_id(); 1726 1727 // Now write the args into the outgoing interpreter space 1728 bool has_receiver = false; 1729 Register receiver_reg = noreg; 1730 int member_arg_pos = -1; 1731 Register member_reg = noreg; 1732 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1733 if (ref_kind != 0) { 1734 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1735 member_reg = rbx; // known to be free at this point 1736 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1737 } else if (iid == vmIntrinsics::_invokeBasic) { 1738 has_receiver = true; 1739 } else if (iid == vmIntrinsics::_linkToNative) { 1740 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1741 member_reg = rbx; // known to be free at this point 1742 } else { 1743 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1744 } 1745 1746 if (member_reg != noreg) { 1747 // Load the member_arg into register, if necessary. 1748 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1749 VMReg r = regs[member_arg_pos].first(); 1750 if (r->is_stack()) { 1751 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1752 } else { 1753 // no data motion is needed 1754 member_reg = r->as_Register(); 1755 } 1756 } 1757 1758 if (has_receiver) { 1759 // Make sure the receiver is loaded into a register. 1760 assert(method->size_of_parameters() > 0, "oob"); 1761 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1762 VMReg r = regs[0].first(); 1763 assert(r->is_valid(), "bad receiver arg"); 1764 if (r->is_stack()) { 1765 // Porting note: This assumes that compiled calling conventions always 1766 // pass the receiver oop in a register. If this is not true on some 1767 // platform, pick a temp and load the receiver from stack. 
1768 fatal("receiver always in a register");
1769 receiver_reg = j_rarg0; // known to be free at this point
1770 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1771 } else {
1772 // no data motion is needed
1773 receiver_reg = r->as_Register();
1774 }
1775 }
1776
1777 // Figure out which address we are really jumping to:
1778 MethodHandles::generate_method_handle_dispatch(masm, iid,
1779 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1780 }
1781
1782 // ---------------------------------------------------------------------------
1783 // Generate a native wrapper for a given method. The method takes arguments
1784 // in the Java compiled code convention, marshals them to the native
1785 // convention (handlizes oops, etc), transitions to native, makes the call,
1786 // returns to java state (possibly blocking), unhandlizes any result and
1787 // returns.
1788 //
1789 // Critical native functions are a shorthand for the use of
1790 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1791 // functions. The wrapper is expected to unpack the arguments before
1792 // passing them to the callee. Critical native functions leave the state _in_Java,
1793 // since they cannot stop for GC.
1794 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1795 // block and the check for pending exceptions, since it's impossible for them
1796 // to be thrown.
1797 //
1798 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1799 const methodHandle& method,
1800 int compile_id,
1801 BasicType* in_sig_bt,
1802 VMRegPair* in_regs,
1803 BasicType ret_type) {
1804 if (method->is_continuation_native_intrinsic()) {
1805 int exception_offset = -1;
1806 OopMapSet* oop_maps = new OopMapSet();
1807 int frame_complete = -1;
1808 int stack_slots = -1;
1809 int interpreted_entry_offset = -1;
1810 int vep_offset = -1;
1811 if (method->is_continuation_enter_intrinsic()) {
1812 gen_continuation_enter(masm,
1813 in_regs,
1814 exception_offset,
1815 oop_maps,
1816 frame_complete,
1817 stack_slots,
1818 interpreted_entry_offset,
1819 vep_offset);
1820 } else if (method->is_continuation_yield_intrinsic()) {
1821 gen_continuation_yield(masm,
1822 in_regs,
1823 oop_maps,
1824 frame_complete,
1825 stack_slots,
1826 vep_offset);
1827 } else {
1828 guarantee(false, "Unknown Continuation native intrinsic");
1829 }
1830
1831 #ifdef ASSERT
1832 if (method->is_continuation_enter_intrinsic()) {
1833 assert(interpreted_entry_offset != -1, "Must be set");
1834 assert(exception_offset != -1, "Must be set");
1835 } else {
1836 assert(interpreted_entry_offset == -1, "Must be unset");
1837 assert(exception_offset == -1, "Must be unset");
1838 }
1839 assert(frame_complete != -1, "Must be set");
1840 assert(stack_slots != -1, "Must be set");
1841 assert(vep_offset != -1, "Must be set");
1842 #endif
1843
1844 __ flush();
1845 nmethod* nm = nmethod::new_native_nmethod(method,
1846 compile_id,
1847 masm->code(),
1848 vep_offset,
1849 frame_complete,
1850 stack_slots,
1851 in_ByteSize(-1),
1852 in_ByteSize(-1),
1853 oop_maps,
1854 exception_offset);
1855 if (nm == nullptr) return nm;
1856 if (method->is_continuation_enter_intrinsic()) {
1857 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1858 } else if (method->is_continuation_yield_intrinsic()) {
1859 _cont_doYield_stub = nm;
1860 }
1861 return nm;
1862 }
1863
1864 if (method->is_method_handle_intrinsic()) {
1865 vmIntrinsics::ID iid = method->intrinsic_id();
1866 intptr_t
start = (intptr_t)__ pc(); 1867 int vep_offset = ((intptr_t)__ pc()) - start; 1868 gen_special_dispatch(masm, 1869 method, 1870 in_sig_bt, 1871 in_regs); 1872 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1873 __ flush(); 1874 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1875 return nmethod::new_native_nmethod(method, 1876 compile_id, 1877 masm->code(), 1878 vep_offset, 1879 frame_complete, 1880 stack_slots / VMRegImpl::slots_per_word, 1881 in_ByteSize(-1), 1882 in_ByteSize(-1), 1883 nullptr); 1884 } 1885 address native_func = method->native_function(); 1886 assert(native_func != nullptr, "must have function"); 1887 1888 // An OopMap for lock (and class if static) 1889 OopMapSet *oop_maps = new OopMapSet(); 1890 intptr_t start = (intptr_t)__ pc(); 1891 1892 // We have received a description of where all the java arg are located 1893 // on entry to the wrapper. We need to convert these args to where 1894 // the jni function will expect them. To figure out where they go 1895 // we convert the java signature to a C signature by inserting 1896 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1897 1898 const int total_in_args = method->size_of_parameters(); 1899 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1900 1901 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1902 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1903 1904 int argc = 0; 1905 out_sig_bt[argc++] = T_ADDRESS; 1906 if (method->is_static()) { 1907 out_sig_bt[argc++] = T_OBJECT; 1908 } 1909 1910 for (int i = 0; i < total_in_args ; i++ ) { 1911 out_sig_bt[argc++] = in_sig_bt[i]; 1912 } 1913 1914 // Now figure out where the args must be stored and how much stack space 1915 // they require. 1916 int out_arg_slots; 1917 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1918 1919 // Compute framesize for the wrapper. We need to handlize all oops in 1920 // incoming registers 1921 1922 // Calculate the total number of stack slots we will need. 
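// (Illustrative tally, assuming a non-static, non-synchronized method on this 64-bit
// target, where out_preserve_stack_slots() is 0 and VMRegImpl::slots_per_word is 2:
//
//   outgoing native args        : out_arg_slots (as reported by c_calling_convention())
//   inbound oop handle area     : 6 * 2 = 12 slots
//   klass handle (if static)    : +2 slots
//   lock box (if synchronized)  : +2 slots
//   temps + return addr + rbp   : 6 slots
//
// The total is then rounded up to StackAlignmentInSlots just below.)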
1923 1924 // First count the abi requirement plus all of the outgoing args 1925 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1926 1927 // Now the space for the inbound oop handle area 1928 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1929 1930 int oop_handle_offset = stack_slots; 1931 stack_slots += total_save_slots; 1932 1933 // Now any space we need for handlizing a klass if static method 1934 1935 int klass_slot_offset = 0; 1936 int klass_offset = -1; 1937 int lock_slot_offset = 0; 1938 bool is_static = false; 1939 1940 if (method->is_static()) { 1941 klass_slot_offset = stack_slots; 1942 stack_slots += VMRegImpl::slots_per_word; 1943 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1944 is_static = true; 1945 } 1946 1947 // Plus a lock if needed 1948 1949 if (method->is_synchronized()) { 1950 lock_slot_offset = stack_slots; 1951 stack_slots += VMRegImpl::slots_per_word; 1952 } 1953 1954 // Now a place (+2) to save return values or temp during shuffling 1955 // + 4 for return address (which we own) and saved rbp 1956 stack_slots += 6; 1957 1958 // Ok The space we have allocated will look like: 1959 // 1960 // 1961 // FP-> | | 1962 // |---------------------| 1963 // | 2 slots for moves | 1964 // |---------------------| 1965 // | lock box (if sync) | 1966 // |---------------------| <- lock_slot_offset 1967 // | klass (if static) | 1968 // |---------------------| <- klass_slot_offset 1969 // | oopHandle area | 1970 // |---------------------| <- oop_handle_offset (6 java arg registers) 1971 // | outbound memory | 1972 // | based arguments | 1973 // | | 1974 // |---------------------| 1975 // | | 1976 // SP-> | out_preserved_slots | 1977 // 1978 // 1979 1980 1981 // Now compute actual number of stack words we need rounding to make 1982 // stack properly aligned. 1983 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1984 1985 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1986 1987 // First thing make an ic check to see if we should even be here 1988 1989 // We are free to use all registers as temps without saving them and 1990 // restoring them except rbp. rbp is the only callee save register 1991 // as far as the interpreter and the compiler(s) are concerned. 1992 1993 const Register receiver = j_rarg0; 1994 1995 Label exception_pending; 1996 1997 assert_different_registers(receiver, rscratch1, rscratch2); 1998 __ verify_oop(receiver); 1999 __ ic_check(8 /* end_alignment */); 2000 2001 int vep_offset = ((intptr_t)__ pc()) - start; 2002 2003 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2004 Label L_skip_barrier; 2005 Register klass = r10; 2006 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2007 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2008 2009 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2010 2011 __ bind(L_skip_barrier); 2012 } 2013 2014 #ifdef COMPILER1 2015 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
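// (Rough idea, hedged: when the mark word already carries an installed identity hash and
// the object's lock state does not hide it, the hash can be returned here without ever
// transitioning to native; otherwise control falls through to the full wrapper below.)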
2016 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2017 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2018 }
2019 #endif // COMPILER1
2020
2021 // The instruction at the verified entry point must be 5 bytes or longer
2022 // because it can be patched on the fly by make_non_entrant. The stack bang
2023 // instruction fits that requirement.
2024
2025 // Generate stack overflow check
2026 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2027
2028 // Generate a new frame for the wrapper.
2029 __ enter();
2030 // -2 because return address is already present and so is saved rbp
2031 __ subptr(rsp, stack_size - 2*wordSize);
2032
2033 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2034 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2035 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2036
2037 // Frame is now completed as far as size and linkage.
2038 int frame_complete = ((intptr_t)__ pc()) - start;
2039
2040 #ifdef ASSERT
2041 __ check_stack_alignment(rsp, "improperly aligned stack");
2042 #endif /* ASSERT */
2043
2044
2045 // We use r14 as the oop handle for the receiver/klass
2046 // It is callee save so it survives the call to native
2047
2048 const Register oop_handle_reg = r14;
2049
2050 //
2051 // We immediately shuffle the arguments so that for any vm call we have to
2052 // make from here on out (sync slow path, jvmti, etc.) we will have
2053 // captured the oops from our caller and have a valid oopMap for
2054 // them.
2055
2056 // -----------------
2057 // The Grand Shuffle
2058
2059 // The Java calling convention is either equal (linux) or denser (win64) than the
2060 // c calling convention. However, because of the jni_env argument, the c calling
2061 // convention always has at least one more argument (and two for static methods) than Java.
2062 // Therefore if we move the args from java -> c backwards then we will never have
2063 // a register->register conflict and we don't have to build a dependency graph
2064 // and figure out how to break any cycles.
2065 //
2066
2067 // Record esp-based slot for receiver on stack for non-static methods
2068 int receiver_offset = -1;
2069
2070 // This is a trick. We double the stack slots so we can claim
2071 // the oops in the caller's frame. Since we are sure to have
2072 // more args than the caller, doubling is enough to make
2073 // sure we can capture all the incoming oop args from the
2074 // caller.
2075 //
2076 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2077
2078 // Mark location of rbp (someday)
2079 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2080
2081 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2082 // All inbound args are referenced based on rbp and all outbound args via rsp.
2083
2084
2085 #ifdef ASSERT
2086 bool reg_destroyed[Register::number_of_registers];
2087 bool freg_destroyed[XMMRegister::number_of_registers];
2088 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2089 reg_destroyed[r] = false;
2090 }
2091 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2092 freg_destroyed[f] = false;
2093 }
2094
2095 #endif /* ASSERT */
2096
2097 // For JNI natives the incoming and outgoing registers are offset upwards.
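// (Worked example of the pairing built below, assuming a non-static method with two Java
// parameters, i.e. total_in_args == 2 and total_c_args == 3: the loop pushes the pairs
// (1, 2) and then (0, 1), so the moves run last-argument-first --
// java arg 1 -> c arg 2, java arg 0 -> c arg 1 -- leaving c arg 0 free for the JNIEnv*.)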
2098 GrowableArray<int> arg_order(2 * total_in_args); 2099 2100 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2101 arg_order.push(i); 2102 arg_order.push(c_arg); 2103 } 2104 2105 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2106 int i = arg_order.at(ai); 2107 int c_arg = arg_order.at(ai + 1); 2108 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2109 #ifdef ASSERT 2110 if (in_regs[i].first()->is_Register()) { 2111 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2112 } else if (in_regs[i].first()->is_XMMRegister()) { 2113 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2114 } 2115 if (out_regs[c_arg].first()->is_Register()) { 2116 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2117 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2118 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2119 } 2120 #endif /* ASSERT */ 2121 switch (in_sig_bt[i]) { 2122 case T_ARRAY: 2123 case T_OBJECT: 2124 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2125 ((i == 0) && (!is_static)), 2126 &receiver_offset); 2127 break; 2128 case T_VOID: 2129 break; 2130 2131 case T_FLOAT: 2132 __ float_move(in_regs[i], out_regs[c_arg]); 2133 break; 2134 2135 case T_DOUBLE: 2136 assert( i + 1 < total_in_args && 2137 in_sig_bt[i + 1] == T_VOID && 2138 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2139 __ double_move(in_regs[i], out_regs[c_arg]); 2140 break; 2141 2142 case T_LONG : 2143 __ long_move(in_regs[i], out_regs[c_arg]); 2144 break; 2145 2146 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2147 2148 default: 2149 __ move32_64(in_regs[i], out_regs[c_arg]); 2150 } 2151 } 2152 2153 int c_arg; 2154 2155 // Pre-load a static method's oop into r14. Used both by locking code and 2156 // the normal JNI call code. 2157 // point c_arg at the first arg that is already loaded in case we 2158 // need to spill before we call out 2159 c_arg = total_c_args - total_in_args; 2160 2161 if (method->is_static()) { 2162 2163 // load oop into a register 2164 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2165 2166 // Now handlize the static class mirror it's known not-null. 2167 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2168 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2169 2170 // Now get the handle 2171 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2172 // store the klass handle as second argument 2173 __ movptr(c_rarg1, oop_handle_reg); 2174 // and protect the arg if we must spill 2175 c_arg--; 2176 } 2177 2178 // Change state to native (we save the return address in the thread, since it might not 2179 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2180 // points into the right code segment. It does not have to be the correct return pc. 2181 // We use the same pc/oopMap repeatedly when we call out 2182 2183 Label native_return; 2184 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2185 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2186 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2187 } else { 2188 intptr_t the_pc = (intptr_t) __ pc(); 2189 oop_maps->add_gc_map(the_pc - start, map); 2190 2191 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2192 } 2193 2194 // We have all of the arguments setup at this point. 
We must not touch any of the
2195 // argument registers from now on (what if we save/restore them and there are no oops?).
2196
2197 if (DTraceMethodProbes) {
2198 // protect the args we've loaded
2199 save_args(masm, total_c_args, c_arg, out_regs);
2200 __ mov_metadata(c_rarg1, method());
2201 __ call_VM_leaf(
2202 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2203 r15_thread, c_rarg1);
2204 restore_args(masm, total_c_args, c_arg, out_regs);
2205 }
2206
2207 // RedefineClasses() tracing support for obsolete method entry
2208 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2209 // protect the args we've loaded
2210 save_args(masm, total_c_args, c_arg, out_regs);
2211 __ mov_metadata(c_rarg1, method());
2212 __ call_VM_leaf(
2213 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2214 r15_thread, c_rarg1);
2215 restore_args(masm, total_c_args, c_arg, out_regs);
2216 }
2217
2218 // Lock a synchronized method
2219
2220 // Register definitions used by locking and unlocking
2221
2222 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2223 const Register obj_reg = rbx; // Will contain the oop
2224 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2225 const Register old_hdr = r13; // value of old header at unlock time
2226
2227 Label slow_path_lock;
2228 Label lock_done;
2229
2230 if (method->is_synchronized()) {
2231 Label count_mon;
2232
2233 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2234
2235 // Get the handle (the 2nd argument)
2236 __ mov(oop_handle_reg, c_rarg1);
2237
2238 // Get address of the box
2239
2240 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2241
2242 // Load the oop from the handle
2243 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2244
2245 if (LockingMode == LM_MONITOR) {
2246 __ jmp(slow_path_lock);
2247 } else if (LockingMode == LM_LEGACY) {
2248 // Load immediate 1 into swap_reg %rax
2249 __ movl(swap_reg, 1);
2250
2251 // Load (object->mark() | 1) into swap_reg %rax
2252 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2253
2254 // Save (object->mark() | 1) into BasicLock's displaced header
2255 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2256
2257 // src -> dest iff dest == rax else rax <- dest
2258 __ lock();
2259 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2260 __ jcc(Assembler::equal, count_mon);
2261
2262 // Hmm should this move to the slow path code area???
2263
2264 // Test if the oopMark is an obvious stack pointer, i.e.,
2265 // 1) (mark & 3) == 0, and
2266 // 2) rsp <= mark < rsp + os::pagesize()
2267 // These two tests can be done by evaluating the following
2268 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2269 // assuming both stack pointer and pagesize have their
2270 // least significant 2 bits clear.
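// (Worked example, assuming a 4K page: 3 - 4096 == -4093 == 0xFFFFFFFFFFFFF003, so the
// AND keeps the two low bits plus every bit at or above the page-size bit. The result is
// zero exactly when (mark & 3) == 0 and 0 <= mark - rsp < 4096, i.e. the displaced mark
// is a stack address within one page above rsp -- the recursive-lock case, for which the
// zero stored into the BasicLock below acts as the "recursive" marker.)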
2271 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2272 2273 __ subptr(swap_reg, rsp); 2274 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2275 2276 // Save the test result, for recursive case, the result is zero 2277 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2278 __ jcc(Assembler::notEqual, slow_path_lock); 2279 2280 __ bind(count_mon); 2281 __ inc_held_monitor_count(); 2282 } else { 2283 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2284 __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2285 } 2286 2287 // Slow path will re-enter here 2288 __ bind(lock_done); 2289 } 2290 2291 // Finally just about ready to make the JNI call 2292 2293 // get JNIEnv* which is first argument to native 2294 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2295 2296 // Now set thread in native 2297 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2298 2299 __ call(RuntimeAddress(native_func)); 2300 2301 // Verify or restore cpu control state after JNI call 2302 __ restore_cpu_control_state_after_jni(rscratch1); 2303 2304 // Unpack native results. 2305 switch (ret_type) { 2306 case T_BOOLEAN: __ c2bool(rax); break; 2307 case T_CHAR : __ movzwl(rax, rax); break; 2308 case T_BYTE : __ sign_extend_byte (rax); break; 2309 case T_SHORT : __ sign_extend_short(rax); break; 2310 case T_INT : /* nothing to do */ break; 2311 case T_DOUBLE : 2312 case T_FLOAT : 2313 // Result is in xmm0 we'll save as needed 2314 break; 2315 case T_ARRAY: // Really a handle 2316 case T_OBJECT: // Really a handle 2317 break; // can't de-handlize until after safepoint check 2318 case T_VOID: break; 2319 case T_LONG: break; 2320 default : ShouldNotReachHere(); 2321 } 2322 2323 // Switch thread to "native transition" state before reading the synchronization state. 2324 // This additional state is necessary because reading and testing the synchronization 2325 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2326 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2327 // VM thread changes sync state to synchronizing and suspends threads for GC. 2328 // Thread A is resumed to finish this native method, but doesn't block here since it 2329 // didn't see any synchronization is progress, and escapes. 2330 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2331 2332 // Force this write out before the read below 2333 if (!UseSystemMemoryBarrier) { 2334 __ membar(Assembler::Membar_mask_bits( 2335 Assembler::LoadLoad | Assembler::LoadStore | 2336 Assembler::StoreLoad | Assembler::StoreStore)); 2337 } 2338 2339 // check for safepoint operation in progress and/or pending suspend requests 2340 { 2341 Label Continue; 2342 Label slow_path; 2343 2344 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2345 2346 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2347 __ jcc(Assembler::equal, Continue); 2348 __ bind(slow_path); 2349 2350 // Don't use call_VM as it will see a possible pending exception and forward it 2351 // and never return here preventing us from clearing _last_native_pc down below. 2352 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2353 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2354 // by hand. 
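// ("By hand" means, roughly: spill any native result, remember rsp, carve out the
// register-argument save area (the Win64 shadow space; on the System V targets
// frame::arg_reg_save_area_bytes is expected to be 0), align rsp to 16, make the call,
// then restore rsp and the result -- the exact sequence emitted below.)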
2355 // 2356 __ vzeroupper(); 2357 save_native_result(masm, ret_type, stack_slots); 2358 __ mov(c_rarg0, r15_thread); 2359 __ mov(r12, rsp); // remember sp 2360 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2361 __ andptr(rsp, -16); // align stack as required by ABI 2362 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2363 __ mov(rsp, r12); // restore sp 2364 __ reinit_heapbase(); 2365 // Restore any method result value 2366 restore_native_result(masm, ret_type, stack_slots); 2367 __ bind(Continue); 2368 } 2369 2370 // change thread state 2371 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2372 2373 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2374 // Check preemption for Object.wait() 2375 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2376 __ cmpptr(rscratch1, NULL_WORD); 2377 __ jccb(Assembler::equal, native_return); 2378 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2379 __ jmp(rscratch1); 2380 __ bind(native_return); 2381 2382 intptr_t the_pc = (intptr_t) __ pc(); 2383 oop_maps->add_gc_map(the_pc - start, map); 2384 } 2385 2386 2387 Label reguard; 2388 Label reguard_done; 2389 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2390 __ jcc(Assembler::equal, reguard); 2391 __ bind(reguard_done); 2392 2393 // native result if any is live 2394 2395 // Unlock 2396 Label slow_path_unlock; 2397 Label unlock_done; 2398 if (method->is_synchronized()) { 2399 2400 Label fast_done; 2401 2402 // Get locked oop from the handle we passed to jni 2403 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2404 2405 if (LockingMode == LM_LEGACY) { 2406 Label not_recur; 2407 // Simple recursive lock? 
2408 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2409 __ jcc(Assembler::notEqual, not_recur); 2410 __ dec_held_monitor_count(); 2411 __ jmpb(fast_done); 2412 __ bind(not_recur); 2413 } 2414 2415 // Must save rax if it is live now because cmpxchg must use it 2416 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2417 save_native_result(masm, ret_type, stack_slots); 2418 } 2419 2420 if (LockingMode == LM_MONITOR) { 2421 __ jmp(slow_path_unlock); 2422 } else if (LockingMode == LM_LEGACY) { 2423 // get address of the stack lock 2424 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2425 // get old displaced header 2426 __ movptr(old_hdr, Address(rax, 0)); 2427 2428 // Atomic swap old header if oop still contains the stack lock 2429 __ lock(); 2430 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2431 __ jcc(Assembler::notEqual, slow_path_unlock); 2432 __ dec_held_monitor_count(); 2433 } else { 2434 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2435 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2436 } 2437 2438 // slow path re-enters here 2439 __ bind(unlock_done); 2440 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2441 restore_native_result(masm, ret_type, stack_slots); 2442 } 2443 2444 __ bind(fast_done); 2445 } 2446 if (DTraceMethodProbes) { 2447 save_native_result(masm, ret_type, stack_slots); 2448 __ mov_metadata(c_rarg1, method()); 2449 __ call_VM_leaf( 2450 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2451 r15_thread, c_rarg1); 2452 restore_native_result(masm, ret_type, stack_slots); 2453 } 2454 2455 __ reset_last_Java_frame(false); 2456 2457 // Unbox oop result, e.g. JNIHandles::resolve value. 2458 if (is_reference_type(ret_type)) { 2459 __ resolve_jobject(rax /* value */, 2460 r15_thread /* thread */, 2461 rcx /* tmp */); 2462 } 2463 2464 if (CheckJNICalls) { 2465 // clear_pending_jni_exception_check 2466 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2467 } 2468 2469 // reset handle block 2470 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2471 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2472 2473 // pop our frame 2474 2475 __ leave(); 2476 2477 // Any exception pending? 2478 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2479 __ jcc(Assembler::notEqual, exception_pending); 2480 2481 // Return 2482 2483 __ ret(0); 2484 2485 // Unexpected paths are out of line and go here 2486 2487 // forward the exception 2488 __ bind(exception_pending); 2489 2490 // and forward the exception 2491 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2492 2493 // Slow path locking & unlocking 2494 if (method->is_synchronized()) { 2495 2496 // BEGIN Slow path lock 2497 __ bind(slow_path_lock); 2498 2499 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2500 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2501 2502 // protect the args we've loaded 2503 save_args(masm, total_c_args, c_arg, out_regs); 2504 2505 __ mov(c_rarg0, obj_reg); 2506 __ mov(c_rarg1, lock_reg); 2507 __ mov(c_rarg2, r15_thread); 2508 2509 // Not a leaf but we have last_Java_frame setup as we want. 
2510 // We don't want to unmount in case of contention since that would complicate preserving 2511 // the arguments that had already been marshalled into the native convention. So we force 2512 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2513 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 2514 __ push_cont_fastpath(); 2515 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2516 __ pop_cont_fastpath(); 2517 restore_args(masm, total_c_args, c_arg, out_regs); 2518 2519 #ifdef ASSERT 2520 { Label L; 2521 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2522 __ jcc(Assembler::equal, L); 2523 __ stop("no pending exception allowed on exit from monitorenter"); 2524 __ bind(L); 2525 } 2526 #endif 2527 __ jmp(lock_done); 2528 2529 // END Slow path lock 2530 2531 // BEGIN Slow path unlock 2532 __ bind(slow_path_unlock); 2533 2534 // If we haven't already saved the native result we must save it now as xmm registers 2535 // are still exposed. 2536 __ vzeroupper(); 2537 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2538 save_native_result(masm, ret_type, stack_slots); 2539 } 2540 2541 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2542 2543 __ mov(c_rarg0, obj_reg); 2544 __ mov(c_rarg2, r15_thread); 2545 __ mov(r12, rsp); // remember sp 2546 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2547 __ andptr(rsp, -16); // align stack as required by ABI 2548 2549 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2550 // NOTE that obj_reg == rbx currently 2551 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2552 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2553 2554 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2555 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2556 __ mov(rsp, r12); // restore sp 2557 __ reinit_heapbase(); 2558 #ifdef ASSERT 2559 { 2560 Label L; 2561 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2562 __ jcc(Assembler::equal, L); 2563 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2564 __ bind(L); 2565 } 2566 #endif /* ASSERT */ 2567 2568 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2569 2570 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2571 restore_native_result(masm, ret_type, stack_slots); 2572 } 2573 __ jmp(unlock_done); 2574 2575 // END Slow path unlock 2576 2577 } // synchronized 2578 2579 // SLOW PATH Reguard the stack if needed 2580 2581 __ bind(reguard); 2582 __ vzeroupper(); 2583 save_native_result(masm, ret_type, stack_slots); 2584 __ mov(r12, rsp); // remember sp 2585 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2586 __ andptr(rsp, -16); // align stack as required by ABI 2587 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2588 __ mov(rsp, r12); // restore sp 2589 __ reinit_heapbase(); 2590 restore_native_result(masm, ret_type, stack_slots); 2591 // and continue 2592 __ jmp(reguard_done); 2593 2594 2595 2596 __ flush(); 2597 2598 nmethod *nm = nmethod::new_native_nmethod(method, 2599 compile_id, 2600 masm->code(), 2601 vep_offset, 2602 frame_complete, 2603 stack_slots / VMRegImpl::slots_per_word, 2604 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2605 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2606 oop_maps); 2607 2608 return nm; 2609 } 2610 2611 // this function returns the adjust size (in number of words) to a c2i adapter 2612 // activation for use during deoptimization 2613 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2614 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2615 } 2616 2617 2618 uint SharedRuntime::out_preserve_stack_slots() { 2619 return 0; 2620 } 2621 2622 2623 // Number of stack slots between incoming argument block and the start of 2624 // a new frame. The PROLOG must add this many slots to the stack. The 2625 // EPILOG must remove this many slots. amd64 needs two slots for 2626 // return address. 2627 uint SharedRuntime::in_preserve_stack_slots() { 2628 return 4 + 2 * VerifyStackAtCalls; 2629 } 2630 2631 VMReg SharedRuntime::thread_register() { 2632 return r15_thread->as_VMReg(); 2633 } 2634 2635 //------------------------------generate_deopt_blob---------------------------- 2636 void SharedRuntime::generate_deopt_blob() { 2637 // Allocate space for the code 2638 ResourceMark rm; 2639 // Setup code generation tools 2640 int pad = 0; 2641 if (UseAVX > 2) { 2642 pad += 1024; 2643 } 2644 if (UseAPX) { 2645 pad += 1024; 2646 } 2647 #if INCLUDE_JVMCI 2648 if (EnableJVMCI) { 2649 pad += 512; // Increase the buffer size when compiling for JVMCI 2650 } 2651 #endif 2652 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id); 2653 CodeBuffer buffer(name, 2560+pad, 1024); 2654 MacroAssembler* masm = new MacroAssembler(&buffer); 2655 int frame_size_in_words; 2656 OopMap* map = nullptr; 2657 OopMapSet *oop_maps = new OopMapSet(); 2658 2659 // ------------- 2660 // This code enters when returning to a de-optimized nmethod. A return 2661 // address has been pushed on the stack, and return values are in 2662 // registers. 2663 // If we are doing a normal deopt then we were called from the patched 2664 // nmethod from the point we returned to the nmethod. So the return 2665 // address on the stack is wrong by NativeCall::instruction_size 2666 // We will adjust the value so it looks like we have the original return 2667 // address on the stack (like when we eagerly deoptimized). 2668 // In the case of an exception pending when deoptimizing, we enter 2669 // with a return address on the stack that points after the call we patched 2670 // into the exception handler. We have the following register state from, 2671 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2672 // rax: exception oop 2673 // rbx: exception handler 2674 // rdx: throwing pc 2675 // So in this case we simply jam rdx into the useless return address and 2676 // the stack looks just like we want. 2677 // 2678 // At this point we need to de-opt. We save the argument return 2679 // registers. We call the first C routine, fetch_unroll_info(). This 2680 // routine captures the return values and returns a structure which 2681 // describes the current frame size and the sizes of all replacement frames. 2682 // The current frame is compiled code and may contain many inlined 2683 // functions, each with their own JVM state. We pop the current frame, then 2684 // push all the new frames. Then we call the C routine unpack_frames() to 2685 // populate these frames. Finally unpack_frames() returns us the new target 2686 // address. 
Notice that callee-save registers are BLOWN here; they have 2687 // already been captured in the vframeArray at the time the return PC was 2688 // patched. 2689 address start = __ pc(); 2690 Label cont; 2691 2692 // Prolog for non exception case! 2693 2694 // Save everything in sight. 2695 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2696 2697 // Normal deoptimization. Save exec mode for unpack_frames. 2698 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2699 __ jmp(cont); 2700 2701 int reexecute_offset = __ pc() - start; 2702 #if INCLUDE_JVMCI && !defined(COMPILER1) 2703 if (UseJVMCICompiler) { 2704 // JVMCI does not use this kind of deoptimization 2705 __ should_not_reach_here(); 2706 } 2707 #endif 2708 2709 // Reexecute case 2710 // return address is the pc describes what bci to do re-execute at 2711 2712 // No need to update map as each call to save_live_registers will produce identical oopmap 2713 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2714 2715 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2716 __ jmp(cont); 2717 2718 #if INCLUDE_JVMCI 2719 Label after_fetch_unroll_info_call; 2720 int implicit_exception_uncommon_trap_offset = 0; 2721 int uncommon_trap_offset = 0; 2722 2723 if (EnableJVMCI) { 2724 implicit_exception_uncommon_trap_offset = __ pc() - start; 2725 2726 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2727 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2728 2729 uncommon_trap_offset = __ pc() - start; 2730 2731 // Save everything in sight. 2732 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2733 // fetch_unroll_info needs to call last_java_frame() 2734 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2735 2736 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2737 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2738 2739 __ movl(r14, Deoptimization::Unpack_reexecute); 2740 __ mov(c_rarg0, r15_thread); 2741 __ movl(c_rarg2, r14); // exec mode 2742 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2743 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2744 2745 __ reset_last_Java_frame(false); 2746 2747 __ jmp(after_fetch_unroll_info_call); 2748 } // EnableJVMCI 2749 #endif // INCLUDE_JVMCI 2750 2751 int exception_offset = __ pc() - start; 2752 2753 // Prolog for exception case 2754 2755 // all registers are dead at this entry point, except for rax, and 2756 // rdx which contain the exception oop and exception pc 2757 // respectively. Set them in TLS and fall thru to the 2758 // unpack_with_exception_in_tls entry point. 2759 2760 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2761 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2762 2763 int exception_in_tls_offset = __ pc() - start; 2764 2765 // new implementation because exception oop is now passed in JavaThread 2766 2767 // Prolog for exception case 2768 // All registers must be preserved because they might be used by LinearScan 2769 // Exceptiop oop and throwing PC are passed in JavaThread 2770 // tos: stack at point of call to method that threw the exception (i.e. 
only 2771 // args are on the stack, no return address) 2772 2773 // make room on stack for the return address 2774 // It will be patched later with the throwing pc. The correct value is not 2775 // available now because loading it from memory would destroy registers. 2776 __ push(0); 2777 2778 // Save everything in sight. 2779 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2780 2781 // Now it is safe to overwrite any register 2782 2783 // Deopt during an exception. Save exec mode for unpack_frames. 2784 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2785 2786 // load throwing pc from JavaThread and patch it as the return address 2787 // of the current frame. Then clear the field in JavaThread 2788 2789 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2790 __ movptr(Address(rbp, wordSize), rdx); 2791 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2792 2793 #ifdef ASSERT 2794 // verify that there is really an exception oop in JavaThread 2795 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2796 __ verify_oop(rax); 2797 2798 // verify that there is no pending exception 2799 Label no_pending_exception; 2800 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2801 __ testptr(rax, rax); 2802 __ jcc(Assembler::zero, no_pending_exception); 2803 __ stop("must not have pending exception here"); 2804 __ bind(no_pending_exception); 2805 #endif 2806 2807 __ bind(cont); 2808 2809 // Call C code. Need thread and this frame, but NOT official VM entry 2810 // crud. We cannot block on this call, no GC can happen. 2811 // 2812 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2813 2814 // fetch_unroll_info needs to call last_java_frame(). 2815 2816 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2817 #ifdef ASSERT 2818 { Label L; 2819 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2820 __ jcc(Assembler::equal, L); 2821 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2822 __ bind(L); 2823 } 2824 #endif // ASSERT 2825 __ mov(c_rarg0, r15_thread); 2826 __ movl(c_rarg1, r14); // exec_mode 2827 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2828 2829 // Need to have an oopmap that tells fetch_unroll_info where to 2830 // find any register it might need. 2831 oop_maps->add_gc_map(__ pc() - start, map); 2832 2833 __ reset_last_Java_frame(false); 2834 2835 #if INCLUDE_JVMCI 2836 if (EnableJVMCI) { 2837 __ bind(after_fetch_unroll_info_call); 2838 } 2839 #endif 2840 2841 // Load UnrollBlock* into rdi 2842 __ mov(rdi, rax); 2843 2844 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2845 Label noException; 2846 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2847 __ jcc(Assembler::notEqual, noException); 2848 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2849 // QQQ this is useless it was null above 2850 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2851 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2852 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2853 2854 __ verify_oop(rax); 2855 2856 // Overwrite the result registers with the exception results. 
2857 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2858 // I think this is useless 2859 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2860 2861 __ bind(noException); 2862 2863 // Only register save data is on the stack. 2864 // Now restore the result registers. Everything else is either dead 2865 // or captured in the vframeArray. 2866 RegisterSaver::restore_result_registers(masm); 2867 2868 // All of the register save area has been popped of the stack. Only the 2869 // return address remains. 2870 2871 // Pop all the frames we must move/replace. 2872 // 2873 // Frame picture (youngest to oldest) 2874 // 1: self-frame (no frame link) 2875 // 2: deopting frame (no frame link) 2876 // 3: caller of deopting frame (could be compiled/interpreted). 2877 // 2878 // Note: by leaving the return address of self-frame on the stack 2879 // and using the size of frame 2 to adjust the stack 2880 // when we are done the return to frame 3 will still be on the stack. 2881 2882 // Pop deoptimized frame 2883 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2884 __ addptr(rsp, rcx); 2885 2886 // rsp should be pointing at the return address to the caller (3) 2887 2888 // Pick up the initial fp we should save 2889 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2890 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2891 2892 #ifdef ASSERT 2893 // Compilers generate code that bang the stack by as much as the 2894 // interpreter would need. So this stack banging should never 2895 // trigger a fault. Verify that it does not on non product builds. 2896 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2897 __ bang_stack_size(rbx, rcx); 2898 #endif 2899 2900 // Load address of array of frame pcs into rcx 2901 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2902 2903 // Trash the old pc 2904 __ addptr(rsp, wordSize); 2905 2906 // Load address of array of frame sizes into rsi 2907 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2908 2909 // Load counter into rdx 2910 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2911 2912 // Now adjust the caller's stack to make up for the extra locals 2913 // but record the original sp so that we can save it in the skeletal interpreter 2914 // frame and the stack walking of interpreter_sender will get the unextended sp 2915 // value and not the "real" sp value. 
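// (Sketch of the effect of the loop below, in rough pseudo-C++ -- a hedged illustration
// of the emitted instructions, not the instructions themselves:
//
//   caller_sp -= caller_adjustment;           // make room for the extra locals
//   for (int k = 0; k < number_of_frames; k++) {
//     push(frame_pcs[k]);                     // return address for frame k
//     push(rbp); rbp = rsp;                   // enter(): link a new skeletal frame
//     rsp -= frame_sizes[k] - 2*wordSize;     // body; pc and rbp were pushed by hand
//   }
//   push(frame_pcs[number_of_frames]);        // final return address
//
// The skeletal interpreter frames are populated later by the unpack_frames() call below.)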
2916 2917 const Register sender_sp = r8; 2918 2919 __ mov(sender_sp, rsp); 2920 __ movl(rbx, Address(rdi, 2921 Deoptimization::UnrollBlock:: 2922 caller_adjustment_offset())); 2923 __ subptr(rsp, rbx); 2924 2925 // Push interpreter frames in a loop 2926 Label loop; 2927 __ bind(loop); 2928 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2929 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2930 __ pushptr(Address(rcx, 0)); // Save return address 2931 __ enter(); // Save old & set new ebp 2932 __ subptr(rsp, rbx); // Prolog 2933 // This value is corrected by layout_activation_impl 2934 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2935 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2936 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2937 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2938 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2939 __ decrementl(rdx); // Decrement counter 2940 __ jcc(Assembler::notZero, loop); 2941 __ pushptr(Address(rcx, 0)); // Save final return address 2942 2943 // Re-push self-frame 2944 __ enter(); // Save old & set new ebp 2945 2946 // Allocate a full sized register save area. 2947 // Return address and rbp are in place, so we allocate two less words. 2948 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2949 2950 // Restore frame locals after moving the frame 2951 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2952 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2953 2954 // Call C code. Need thread but NOT official VM entry 2955 // crud. We cannot block on this call, no GC can happen. Call should 2956 // restore return values to their stack-slots with the new SP. 2957 // 2958 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2959 2960 // Use rbp because the frames look interpreted now 2961 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2962 // Don't need the precise return PC here, just precise enough to point into this code blob. 2963 address the_pc = __ pc(); 2964 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2965 2966 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2967 __ mov(c_rarg0, r15_thread); 2968 __ movl(c_rarg1, r14); // second arg: exec_mode 2969 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2970 // Revert SP alignment after call since we're going to do some SP relative addressing below 2971 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2972 2973 // Set an oopmap for the call site 2974 // Use the same PC we used for the last java frame 2975 oop_maps->add_gc_map(the_pc - start, 2976 new OopMap( frame_size_in_words, 0 )); 2977 2978 // Clear fp AND pc 2979 __ reset_last_Java_frame(true); 2980 2981 // Collect return values 2982 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2983 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2984 // I think this is useless (throwing pc?) 2985 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2986 2987 // Pop self-frame. 
2988 __ leave(); // Epilog
2989
2990 // Jump to interpreter
2991 __ ret(0);
2992
2993 // Make sure all code is generated
2994 masm->flush();
2995
2996 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2997 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2998 #if INCLUDE_JVMCI
2999 if (EnableJVMCI) {
3000 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3001 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3002 }
3003 #endif
3004 }
3005
3006 //------------------------------generate_handler_blob------
3007 //
3008 // Generate a special Compile2Runtime blob that saves all registers
3009 // and sets up an oopmap.
3010 //
3011 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3012 assert(StubRoutines::forward_exception_entry() != nullptr,
3013 "must be generated before");
3014 assert(is_polling_page_id(id), "expected a polling page stub id");
3015
3016 ResourceMark rm;
3017 OopMapSet *oop_maps = new OopMapSet();
3018 OopMap* map;
3019
3020 // Allocate space for the code. Set up code generation tools.
3021 const char* name = SharedRuntime::stub_name(id);
3022 CodeBuffer buffer(name, 2548, 1024);
3023 MacroAssembler* masm = new MacroAssembler(&buffer);
3024
3025 address start = __ pc();
3026 address call_pc = nullptr;
3027 int frame_size_in_words;
3028 bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3029 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3030
3031 // Make room for return address (or push it again)
3032 if (!cause_return) {
3033 __ push(rbx);
3034 }
3035
3036 // Save registers, fpu state, and flags
3037 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3038
3039 // The following is basically a call_VM. However, we need the precise
3040 // address of the call in order to generate an oopmap. Hence, we do all the
3041 // work ourselves.
3042
3043 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3044
3045 // The return address must always be correct so that the frame constructor
3046 // never sees an invalid pc.
3047
3048 if (!cause_return) {
3049 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3050 // Additionally, rbx is a callee-saved register and we can look at it later to determine
3051 // if someone changed the return address for us!
3052 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3053 __ movptr(Address(rbp, wordSize), rbx);
3054 }
3055
3056 // Do the call
3057 __ mov(c_rarg0, r15_thread);
3058 __ call(RuntimeAddress(call_ptr));
3059
3060 // Set an oopmap for the call site. This oopmap will map all
3061 // oop-registers and debug-info registers as callee-saved. This
3062 // will allow deoptimization at this safepoint to find all possible
3063 // debug-info recordings, as well as let GC find all oops.
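// (The offset recorded below is taken right after the call instruction, i.e. it is the
// return address that will appear in this frame during stack walking; the runtime looks
// the oopmap up by exactly that pc-offset within the blob.)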
3064
3065 oop_maps->add_gc_map( __ pc() - start, map);
3066
3067 Label noException;
3068
3069 __ reset_last_Java_frame(false);
3070
3071 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3072 __ jcc(Assembler::equal, noException);
3073
3074 // Exception pending
3075
3076 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3077
3078 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3079
3080 // No exception case
3081 __ bind(noException);
3082
3083 Label no_adjust;
3084 #ifdef ASSERT
3085 Label bail;
3086 #endif
3087 if (!cause_return) {
3088 Label no_prefix, not_special, check_rex_prefix;
3089
3090 // If our stashed return pc was modified by the runtime, we avoid touching it
3091 __ cmpptr(rbx, Address(rbp, wordSize));
3092 __ jcc(Assembler::notEqual, no_adjust);
3093
3094 // Skip over the poll instruction.
3095 // See NativeInstruction::is_safepoint_poll()
3096 // Possible encodings:
3097 // 85 00       test %eax,(%rax)
3098 // 85 01       test %eax,(%rcx)
3099 // 85 02       test %eax,(%rdx)
3100 // 85 03       test %eax,(%rbx)
3101 // 85 06       test %eax,(%rsi)
3102 // 85 07       test %eax,(%rdi)
3103 //
3104 // 41 85 00    test %eax,(%r8)
3105 // 41 85 01    test %eax,(%r9)
3106 // 41 85 02    test %eax,(%r10)
3107 // 41 85 03    test %eax,(%r11)
3108 // 41 85 06    test %eax,(%r14)
3109 // 41 85 07    test %eax,(%r15)
3110 //
3111 // 85 04 24    test %eax,(%rsp)
3112 // 41 85 04 24 test %eax,(%r12)
3113 // 85 45 00    test %eax,0x0(%rbp)
3114 // 41 85 45 00 test %eax,0x0(%r13)
3115 //
3116 // Notes:
3117 // Format of the legacy MAP0 test instruction:
3118 //   [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3119 //   o For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first register
3120 //     operand and of the base register of the memory operand lie in [0-8), hence we do not require an
3121 //     additional REX prefix (whose REX.B bit would store the MSB of the register encoding), which
3122 //     is why a two-byte encoding is sufficient here.
3123 //   o For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3124 //     register of the memory operand is 1000, thus we need an additional REX prefix in this case,
3125 //     thereby adding an extra byte to the instruction encoding.
3126 //   o In case the BASE register is one of the 32 extended GPR registers available only on targets
3127 //     supporting the Intel APX extension, we need to emit a two-byte REX2 prefix to hold the
3128 //     most significant two bits of the 5-bit register encoding.
3129
3130 if (VM_Version::supports_apx_f()) {
3131 __ cmpb(Address(rbx, 0), Assembler::REX2);
3132 __ jccb(Assembler::notEqual, check_rex_prefix);
3133 __ addptr(rbx, 2);
3134 __ bind(check_rex_prefix);
3135 }
3136 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3137 __ jccb(Assembler::notEqual, no_prefix);
3138 __ addptr(rbx, 1);
3139 __ bind(no_prefix);
3140 #ifdef ASSERT
3141 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3142 #endif
3143 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3144 // r12/rsp 0x04
3145 // r13/rbp 0x05
3146 __ movzbq(rcx, Address(rbx, 1));
3147 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3148 __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3149 __ cmpptr(rcx, 1);
3150 __ jccb(Assembler::above, not_special);
3151 __ addptr(rbx, 1);
3152 __ bind(not_special);
3153 #ifdef ASSERT
3154 // Verify the correct encoding of the poll we're about to skip.
3155 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3156 __ jcc(Assembler::notEqual, bail);
3157 // Mask out the modrm bits
3158 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3159 // rax encodes to 0, so if the bits are nonzero it's incorrect
3160 __ jcc(Assembler::notZero, bail);
3161 #endif
3162 // Adjust return pc forward to step over the safepoint poll instruction
3163 __ addptr(rbx, 2);
3164 __ movptr(Address(rbp, wordSize), rbx);
3165 }
3166
3167 __ bind(no_adjust);
3168 // Normal exit: restore registers and return.
3169 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3170 __ ret(0);
3171
3172 #ifdef ASSERT
3173 __ bind(bail);
3174 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3175 #endif
3176
3177 // Make sure all code is generated
3178 masm->flush();
3179
3180 // Fill out other meta info
3181 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3182 }
3183
3184 //
3185 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3186 //
3187 // Generate a stub that calls into the VM to find out the proper destination
3188 // of a Java call. All the argument registers are live at this point,
3189 // but since this is generic code we don't know what they are and the caller
3190 // must do any GC of the args.
3191 //
3192 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3193 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3194 assert(is_resolve_id(id), "expected a resolve stub id");
3195
3196 // allocate space for the code
3197 ResourceMark rm;
3198
3199 const char* name = SharedRuntime::stub_name(id);
3200 CodeBuffer buffer(name, 1552, 512);
3201 MacroAssembler* masm = new MacroAssembler(&buffer);
3202
3203 int frame_size_in_words;
3204
3205 OopMapSet *oop_maps = new OopMapSet();
3206 OopMap* map = nullptr;
3207
3208 int start = __ offset();
3209
3210 // No need to save vector registers since they are caller-saved anyway.
3211 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3212
3213 int frame_complete = __ offset();
3214
3215 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3216
3217 __ mov(c_rarg0, r15_thread);
3218
3219 __ call(RuntimeAddress(destination));
3220
3221
3222 // Set an oopmap for the call site.
3223 // We need this not only for callee-saved registers, but also for volatile
3224 // registers that the compiler might be keeping live across a safepoint.
3225
3226 oop_maps->add_gc_map( __ offset() - start, map);
3227
3228 // rax contains the address we are going to jump to, assuming no exception got installed
3229
3230 // clear last_Java_sp
3231 __ reset_last_Java_frame(false);
3232 // check for pending exceptions
3233 Label pending;
3234 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3235 __ jcc(Assembler::notEqual, pending);
3236
3237 // get the returned Method*
3238 __ get_vm_result_2(rbx, r15_thread);
3239 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3240
3241 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3242
3243 RegisterSaver::restore_live_registers(masm);
3244
3245 // We are back to the original state on entry and ready to go.
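// (More precisely: all registers were reloaded from the save area, but the rax and rbx
// slots were overwritten above, so rax now holds the resolved entry point and rbx the
// resolved Method*, while the Java argument registers still hold the original arguments.)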
3246 3247 __ jmp(rax); 3248 3249 // Pending exception after the safepoint 3250 3251 __ bind(pending); 3252 3253 RegisterSaver::restore_live_registers(masm); 3254 3255 // exception pending => remove activation and forward to exception handler 3256 3257 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 3258 3259 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3260 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3261 3262 // ------------- 3263 // make sure all code is generated 3264 masm->flush(); 3265 3266 // return the blob 3267 // frame_size_words or bytes?? 3268 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3269 } 3270 3271 // Continuation point for throwing of implicit exceptions that are 3272 // not handled in the current activation. Fabricates an exception 3273 // oop and initiates normal exception dispatching in this 3274 // frame. Since we need to preserve callee-saved values (currently 3275 // only for C2, but done for C1 as well) we need a callee-saved oop 3276 // map and therefore have to make these stubs into RuntimeStubs 3277 // rather than BufferBlobs. If the compiler needs all registers to 3278 // be preserved between the fault point and the exception handler 3279 // then it must assume responsibility for that in 3280 // AbstractCompiler::continuation_for_implicit_null_exception or 3281 // continuation_for_implicit_division_by_zero_exception. All other 3282 // implicit exceptions (e.g., NullPointerException or 3283 // AbstractMethodError on entry) are either at call sites or 3284 // otherwise assume that stack unwinding will be initiated, so 3285 // caller saved registers were assumed volatile in the compiler. 3286 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) { 3287 assert(is_throw_id(id), "expected a throw stub id"); 3288 3289 const char* name = SharedRuntime::stub_name(id); 3290 3291 // Information about frame layout at time of blocking runtime call. 3292 // Note that we only have to preserve callee-saved registers since 3293 // the compilers are responsible for supplying a continuation point 3294 // if they expect all registers to be preserved. 
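// Frame layout note: the offsets below count 4-byte (jint) slots. frame::arg_reg_save_area_bytes
// is the Win64 register-argument home area (32 bytes there, 0 elsewhere), so rbp_off starts at
// slot 8 on Windows and at slot 0 otherwise. framesize already covers rbp (2 slots) and the
// return address (2 slots), which is why the prolog below only allocates (framesize - 4) slots.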
3295 enum layout { 3296 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3297 rbp_off2, 3298 return_off, 3299 return_off2, 3300 framesize // inclusive of return address 3301 }; 3302 3303 int insts_size = 512; 3304 int locs_size = 64; 3305 3306 ResourceMark rm; 3307 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3308 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3309 3310 CodeBuffer code(name, insts_size, locs_size); 3311 OopMapSet* oop_maps = new OopMapSet(); 3312 MacroAssembler* masm = new MacroAssembler(&code); 3313 3314 address start = __ pc(); 3315 3316 // This is an inlined and slightly modified version of call_VM 3317 // which has the ability to fetch the return PC out of 3318 // thread-local storage and also sets up last_Java_sp slightly 3319 // differently than the real call_VM 3320 3321 __ enter(); // required for proper stackwalking of RuntimeStub frame 3322 3323 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3324 3325 // return address and rbp are already in place 3326 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3327 3328 int frame_complete = __ pc() - start; 3329 3330 // Set up last_Java_sp and last_Java_fp 3331 address the_pc = __ pc(); 3332 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3333 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3334 3335 // Call runtime 3336 __ movptr(c_rarg0, r15_thread); 3337 BLOCK_COMMENT("call runtime_entry"); 3338 __ call(RuntimeAddress(runtime_entry)); 3339 3340 // Generate oop map 3341 OopMap* map = new OopMap(framesize, 0); 3342 3343 oop_maps->add_gc_map(the_pc - start, map); 3344 3345 __ reset_last_Java_frame(true); 3346 3347 __ leave(); // required for proper stackwalking of RuntimeStub frame 3348 3349 // check for pending exceptions 3350 #ifdef ASSERT 3351 Label L; 3352 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3353 __ jcc(Assembler::notEqual, L); 3354 __ should_not_reach_here(); 3355 __ bind(L); 3356 #endif // ASSERT 3357 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3358 3359 3360 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3361 RuntimeStub* stub = 3362 RuntimeStub::new_runtime_stub(name, 3363 &code, 3364 frame_complete, 3365 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3366 oop_maps, false); 3367 return stub; 3368 } 3369 3370 //------------------------------Montgomery multiplication------------------------ 3371 // 3372 3373 #ifndef _WINDOWS 3374 3375 // Subtract 0:b from carry:a. Return carry. 3376 static julong 3377 sub(julong a[], julong b[], julong carry, long len) { 3378 long long i = 0, cnt = len; 3379 julong tmp; 3380 asm volatile("clc; " 3381 "0: ; " 3382 "mov (%[b], %[i], 8), %[tmp]; " 3383 "sbb %[tmp], (%[a], %[i], 8); " 3384 "inc %[i]; dec %[cnt]; " 3385 "jne 0b; " 3386 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3387 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3388 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3389 : "memory"); 3390 return tmp; 3391 } 3392 3393 // Multiply (unsigned) Long A by Long B, accumulating the double- 3394 // length result into the accumulator formed of T0, T1, and T2. 3395 #define MACC(A, B, T0, T1, T2) \ 3396 do { \ 3397 unsigned long hi, lo; \ 3398 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3399 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3400 : "r"(A), "a"(B) : "cc"); \ 3401 } while(0) 3402 3403 // As above, but add twice the double-length result into the 3404 // accumulator. 
3405 #define MACC2(A, B, T0, T1, T2) \ 3406 do { \ 3407 unsigned long hi, lo; \ 3408 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3409 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3410 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3411 : "r"(A), "a"(B) : "cc"); \ 3412 } while(0) 3413 3414 #else //_WINDOWS 3415 3416 static julong 3417 sub(julong a[], julong b[], julong carry, long len) { 3418 long i; 3419 julong tmp; 3420 unsigned char c = 1; 3421 for (i = 0; i < len; i++) { 3422 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3423 a[i] = tmp; 3424 } 3425 c = _addcarry_u64(c, carry, ~0, &tmp); 3426 return tmp; 3427 } 3428 3429 // Multiply (unsigned) Long A by Long B, accumulating the double- 3430 // length result into the accumulator formed of T0, T1, and T2. 3431 #define MACC(A, B, T0, T1, T2) \ 3432 do { \ 3433 julong hi, lo; \ 3434 lo = _umul128(A, B, &hi); \ 3435 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3436 c = _addcarry_u64(c, hi, T1, &T1); \ 3437 _addcarry_u64(c, T2, 0, &T2); \ 3438 } while(0) 3439 3440 // As above, but add twice the double-length result into the 3441 // accumulator. 3442 #define MACC2(A, B, T0, T1, T2) \ 3443 do { \ 3444 julong hi, lo; \ 3445 lo = _umul128(A, B, &hi); \ 3446 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3447 c = _addcarry_u64(c, hi, T1, &T1); \ 3448 _addcarry_u64(c, T2, 0, &T2); \ 3449 c = _addcarry_u64(0, lo, T0, &T0); \ 3450 c = _addcarry_u64(c, hi, T1, &T1); \ 3451 _addcarry_u64(c, T2, 0, &T2); \ 3452 } while(0) 3453 3454 #endif //_WINDOWS 3455 3456 // Fast Montgomery multiplication. The derivation of the algorithm is 3457 // in A Cryptographic Library for the Motorola DSP56000, 3458 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3459 3460 static void NOINLINE 3461 montgomery_multiply(julong a[], julong b[], julong n[], 3462 julong m[], julong inv, int len) { 3463 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3464 int i; 3465 3466 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3467 3468 for (i = 0; i < len; i++) { 3469 int j; 3470 for (j = 0; j < i; j++) { 3471 MACC(a[j], b[i-j], t0, t1, t2); 3472 MACC(m[j], n[i-j], t0, t1, t2); 3473 } 3474 MACC(a[i], b[0], t0, t1, t2); 3475 m[i] = t0 * inv; 3476 MACC(m[i], n[0], t0, t1, t2); 3477 3478 assert(t0 == 0, "broken Montgomery multiply"); 3479 3480 t0 = t1; t1 = t2; t2 = 0; 3481 } 3482 3483 for (i = len; i < 2*len; i++) { 3484 int j; 3485 for (j = i-len+1; j < len; j++) { 3486 MACC(a[j], b[i-j], t0, t1, t2); 3487 MACC(m[j], n[i-j], t0, t1, t2); 3488 } 3489 m[i-len] = t0; 3490 t0 = t1; t1 = t2; t2 = 0; 3491 } 3492 3493 while (t0) 3494 t0 = sub(m, n, t0, len); 3495 } 3496 3497 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3498 // multiplies so it should be up to 25% faster than Montgomery 3499 // multiplication. However, its loop control is more complex and it 3500 // may actually run slower on some machines. 
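// (Where the saving comes from: in column i of the square, a[j]*a[i-j] equals a[i-j]*a[j],
// so the loops below compute each off-diagonal product once and add it twice via MACC2, plus
// the single diagonal term when i is even. That removes roughly half of the a*a multiplies
// while the m*n reduction multiplies stay the same, hence the asymptotic ~25% noted above.)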
3501
3502 static void NOINLINE
3503 montgomery_square(julong a[], julong n[],
3504 julong m[], julong inv, int len) {
3505 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3506 int i;
3507
3508 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3509
3510 for (i = 0; i < len; i++) {
3511 int j;
3512 int end = (i+1)/2;
3513 for (j = 0; j < end; j++) {
3514 MACC2(a[j], a[i-j], t0, t1, t2);
3515 MACC(m[j], n[i-j], t0, t1, t2);
3516 }
3517 if ((i & 1) == 0) {
3518 MACC(a[j], a[j], t0, t1, t2);
3519 }
3520 for (; j < i; j++) {
3521 MACC(m[j], n[i-j], t0, t1, t2);
3522 }
3523 m[i] = t0 * inv;
3524 MACC(m[i], n[0], t0, t1, t2);
3525
3526 assert(t0 == 0, "broken Montgomery square");
3527
3528 t0 = t1; t1 = t2; t2 = 0;
3529 }
3530
3531 for (i = len; i < 2*len; i++) {
3532 int start = i-len+1;
3533 int end = start + (len - start)/2;
3534 int j;
3535 for (j = start; j < end; j++) {
3536 MACC2(a[j], a[i-j], t0, t1, t2);
3537 MACC(m[j], n[i-j], t0, t1, t2);
3538 }
3539 if ((i & 1) == 0) {
3540 MACC(a[j], a[j], t0, t1, t2);
3541 }
3542 for (; j < len; j++) {
3543 MACC(m[j], n[i-j], t0, t1, t2);
3544 }
3545 m[i-len] = t0;
3546 t0 = t1; t1 = t2; t2 = 0;
3547 }
3548
3549 while (t0)
3550 t0 = sub(m, n, t0, len);
3551 }
3552
3553 // Swap the two 32-bit words in a 64-bit longword.
3554 static julong swap(julong x) {
3555 return (x << 32) | (x >> 32);
3556 }
3557
3558 // Copy len longwords from s to d, word-swapping as we go. The
3559 // destination array is reversed.
3560 static void reverse_words(julong *s, julong *d, int len) {
3561 d += len;
3562 while(len-- > 0) {
3563 d--;
3564 *d = swap(*s);
3565 s++;
3566 }
3567 }
3568
3569 // The threshold at which squaring is advantageous was determined
3570 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3571 #define MONTGOMERY_SQUARING_THRESHOLD 64
3572
3573 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3574 jint len, jlong inv,
3575 jint *m_ints) {
3576 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3577 int longwords = len/2;
3578
3579 // Make very sure we don't use so much space that the stack might
3580 // overflow. 512 jints correspond to a 16384-bit integer and
3581 // will use a total of 8K bytes of stack space here.
3582 int divisor = sizeof(julong) * 4;
3583 guarantee(longwords <= 8192 / divisor, "must be");
3584 int total_allocation = longwords * sizeof (julong) * 4;
3585 julong *scratch = (julong *)alloca(total_allocation);
3586
3587 // Local scratch arrays
3588 julong
3589 *a = scratch + 0 * longwords,
3590 *b = scratch + 1 * longwords,
3591 *n = scratch + 2 * longwords,
3592 *m = scratch + 3 * longwords;
3593
3594 reverse_words((julong *)a_ints, a, longwords);
3595 reverse_words((julong *)b_ints, b, longwords);
3596 reverse_words((julong *)n_ints, n, longwords);
3597
3598 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3599
3600 reverse_words(m, (julong *)m_ints, longwords);
3601 }
3602
3603 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3604 jint len, jlong inv,
3605 jint *m_ints) {
3606 assert(len % 2 == 0, "array length in montgomery_square must be even");
3607 int longwords = len/2;
3608
3609 // Make very sure we don't use so much space that the stack might
3610 // overflow. 512 jints correspond to a 16384-bit integer and
3611 // will use a total of 6K bytes of stack space here.
3612 int divisor = sizeof(julong) * 3; 3613 guarantee(longwords <= (8192 / divisor), "must be"); 3614 int total_allocation = longwords * sizeof (julong) * 3; 3615 julong *scratch = (julong *)alloca(total_allocation); 3616 3617 // Local scratch arrays 3618 julong 3619 *a = scratch + 0 * longwords, 3620 *n = scratch + 1 * longwords, 3621 *m = scratch + 2 * longwords; 3622 3623 reverse_words((julong *)a_ints, a, longwords); 3624 reverse_words((julong *)n_ints, n, longwords); 3625 3626 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3627 ::montgomery_square(a, n, m, (julong)inv, longwords); 3628 } else { 3629 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3630 } 3631 3632 reverse_words(m, (julong *)m_ints, longwords); 3633 } 3634 3635 #if INCLUDE_JFR 3636 3637 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 3638 // It returns a jobject handle to the event writer. 3639 // The handle is dereferenced and the return value is the event writer oop. 3640 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 3641 enum layout { 3642 rbp_off, 3643 rbpH_off, 3644 return_off, 3645 return_off2, 3646 framesize // inclusive of return address 3647 }; 3648 3649 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id); 3650 CodeBuffer code(name, 1024, 64); 3651 MacroAssembler* masm = new MacroAssembler(&code); 3652 address start = __ pc(); 3653 3654 __ enter(); 3655 address the_pc = __ pc(); 3656 3657 int frame_complete = the_pc - start; 3658 3659 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3660 __ movptr(c_rarg0, r15_thread); 3661 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 3662 __ reset_last_Java_frame(true); 3663 3664 // rax is jobject handle result, unpack and process it through a barrier. 3665 __ resolve_global_jobject(rax, r15_thread, c_rarg0); 3666 3667 __ leave(); 3668 __ ret(0); 3669 3670 OopMapSet* oop_maps = new OopMapSet(); 3671 OopMap* map = new OopMap(framesize, 1); 3672 oop_maps->add_gc_map(frame_complete, map); 3673 3674 RuntimeStub* stub = 3675 RuntimeStub::new_runtime_stub(name, 3676 &code, 3677 frame_complete, 3678 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3679 oop_maps, 3680 false); 3681 return stub; 3682 } 3683 3684 // For c2: call to return a leased buffer. 3685 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 3686 enum layout { 3687 rbp_off, 3688 rbpH_off, 3689 return_off, 3690 return_off2, 3691 framesize // inclusive of return address 3692 }; 3693 3694 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id); 3695 CodeBuffer code(name, 1024, 64); 3696 MacroAssembler* masm = new MacroAssembler(&code); 3697 address start = __ pc(); 3698 3699 __ enter(); 3700 address the_pc = __ pc(); 3701 3702 int frame_complete = the_pc - start; 3703 3704 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 3705 __ movptr(c_rarg0, r15_thread); 3706 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 3707 __ reset_last_Java_frame(true); 3708 3709 __ leave(); 3710 __ ret(0); 3711 3712 OopMapSet* oop_maps = new OopMapSet(); 3713 OopMap* map = new OopMap(framesize, 1); 3714 oop_maps->add_gc_map(frame_complete, map); 3715 3716 RuntimeStub* stub = 3717 RuntimeStub::new_runtime_stub(name, 3718 &code, 3719 frame_complete, 3720 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3721 oop_maps, 3722 false); 3723 return stub; 3724 } 3725 3726 #endif // INCLUDE_JFR 3727