1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #endif 29 #include "asm/macroAssembler.hpp" 30 #include "asm/macroAssembler.inline.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/nativeInst.hpp" 34 #include "code/vtableStubs.hpp" 35 #include "compiler/oopMap.hpp" 36 #include "gc/shared/collectedHeap.hpp" 37 #include "gc/shared/gcLocker.hpp" 38 #include "gc/shared/barrierSet.hpp" 39 #include "gc/shared/barrierSetAssembler.hpp" 40 #include "interpreter/interpreter.hpp" 41 #include "logging/log.hpp" 42 #include "memory/resourceArea.hpp" 43 #include "memory/universe.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "oops/method.inline.hpp" 46 #include "prims/methodHandles.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/globals.hpp" 50 #include "runtime/jniHandles.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/vframeArray.hpp" 56 #include "runtime/vm_version.hpp" 57 #include "utilities/align.hpp" 58 #include "utilities/checkedCast.hpp" 59 #include "utilities/formatBuffer.hpp" 60 #include "vmreg_x86.inline.hpp" 61 #ifdef COMPILER1 62 #include "c1/c1_Runtime1.hpp" 63 #endif 64 #ifdef COMPILER2 65 #include "opto/runtime.hpp" 66 #endif 67 #if INCLUDE_JVMCI 68 #include "jvmci/jvmciJavaClasses.hpp" 69 #endif 70 71 #define __ masm-> 72 73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 74 75 class SimpleRuntimeFrame { 76 77 public: 78 79 // Most of the runtime stubs have this simple frame layout. 80 // This class exists to make the layout shared in one place. 81 // Offsets are for compiler stack slots, which are jints. 82 enum layout { 83 // The frame sender code expects that rbp will be in the "natural" place and 84 // will override any oopMap setting for it. We must therefore force the layout 85 // so that it agrees with the frame sender code. 86 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 87 rbp_off2, 88 return_off, return_off2, 89 framesize 90 }; 91 }; 92 93 class RegisterSaver { 94 // Capture info about frame layout. Layout offsets are in jint 95 // units because compiler frame slots are jints. 96 #define XSAVE_AREA_BEGIN 160 97 #define XSAVE_AREA_YMM_BEGIN 576 98 #define XSAVE_AREA_EGPRS 960 99 #define XSAVE_AREA_OPMASK_BEGIN 1088 100 #define XSAVE_AREA_ZMM_BEGIN 1152 101 #define XSAVE_AREA_UPPERBANK 1664 102 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 103 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 104 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 105 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 107 enum layout { 108 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 109 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 110 DEF_XMM_OFFS(0), 111 DEF_XMM_OFFS(1), 112 // 2..15 are implied in range usage 113 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 114 DEF_YMM_OFFS(0), 115 DEF_YMM_OFFS(1), 116 // 2..15 are implied in range usage 117 r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 118 r31H_off, 119 r30_off, r30H_off, 120 r29_off, r29H_off, 121 r28_off, r28H_off, 122 r27_off, r27H_off, 123 r26_off, r26H_off, 124 r25_off, r25H_off, 125 r24_off, r24H_off, 126 r23_off, r23H_off, 127 r22_off, r22H_off, 128 r21_off, r21H_off, 129 r20_off, r20H_off, 130 r19_off, r19H_off, 131 r18_off, r18H_off, 132 r17_off, r17H_off, 133 r16_off, r16H_off, 134 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 135 DEF_OPMASK_OFFS(0), 136 DEF_OPMASK_OFFS(1), 137 // 2..7 are implied in range usage 138 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 139 DEF_ZMM_OFFS(0), 140 DEF_ZMM_OFFS(1), 141 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 142 DEF_ZMM_UPPER_OFFS(16), 143 DEF_ZMM_UPPER_OFFS(17), 144 // 18..31 are implied in range usage 145 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 146 fpu_stateH_end, 147 r15_off, r15H_off, 148 r14_off, r14H_off, 149 r13_off, r13H_off, 150 r12_off, r12H_off, 151 r11_off, r11H_off, 152 r10_off, r10H_off, 153 r9_off, r9H_off, 154 r8_off, r8H_off, 155 rdi_off, rdiH_off, 156 rsi_off, rsiH_off, 157 ignore_off, ignoreH_off, // extra copy of rbp 158 rsp_off, rspH_off, 159 rbx_off, rbxH_off, 160 rdx_off, rdxH_off, 161 rcx_off, rcxH_off, 162 rax_off, raxH_off, 163 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 164 align_off, alignH_off, 165 flags_off, flagsH_off, 166 // The frame sender code expects that rbp will be in the "natural" place and 167 // will override any oopMap setting for it. We must therefore force the layout 168 // so that it agrees with the frame sender code. 169 rbp_off, rbpH_off, // copy of rbp we will restore 170 return_off, returnH_off, // slot for return address 171 reg_save_size // size in compiler stack slots 172 }; 173 174 public: 175 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors); 176 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false); 177 178 // Offsets into the register save area 179 // Used by deoptimization when it is managing result register 180 // values on its own 181 182 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; } 183 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; } 184 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; } 185 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; } 186 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; } 187 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; } 188 189 // During deoptimization only the result registers need to be restored, 190 // all the other values have already been extracted. 191 static void restore_result_registers(MacroAssembler* masm); 192 }; 193 194 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) { 195 int off = 0; 196 int num_xmm_regs = XMMRegister::available_xmm_registers(); 197 #if COMPILER2_OR_JVMCI 198 if (save_wide_vectors && UseAVX == 0) { 199 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX 200 } 201 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 202 #else 203 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI 204 #endif 205 206 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated 207 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs); 208 // OopMap frame size is in compiler stack slots (jint's) not bytes or words 209 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 210 // CodeBlob frame size is in words. 211 int frame_size_in_words = frame_size_in_bytes / wordSize; 212 *total_frame_words = frame_size_in_words; 213 214 // Save registers, fpu state, and flags. 215 // We assume caller has already pushed the return address onto the 216 // stack, so rsp is 8-byte aligned here. 217 // We push rpb twice in this sequence because we want the real rbp 218 // to be under the return like a normal enter. 219 220 __ enter(); // rsp becomes 16-byte aligned here 221 __ pushf(); 222 // Make sure rsp stays 16-byte aligned 223 __ subq(rsp, 8); 224 // Push CPU state in multiple of 16 bytes 225 __ save_legacy_gprs(); 226 __ push_FPU_state(); 227 228 229 // push cpu state handles this on EVEX enabled targets 230 if (save_wide_vectors) { 231 // Save upper half of YMM registers(0..15) 232 int base_addr = XSAVE_AREA_YMM_BEGIN; 233 for (int n = 0; n < 16; n++) { 234 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 235 } 236 if (VM_Version::supports_evex()) { 237 // Save upper half of ZMM registers(0..15) 238 base_addr = XSAVE_AREA_ZMM_BEGIN; 239 for (int n = 0; n < 16; n++) { 240 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 241 } 242 // Save full ZMM registers(16..num_xmm_regs) 243 base_addr = XSAVE_AREA_UPPERBANK; 244 off = 0; 245 int vector_len = Assembler::AVX_512bit; 246 for (int n = 16; n < num_xmm_regs; n++) { 247 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 248 } 249 #if COMPILER2_OR_JVMCI 250 base_addr = XSAVE_AREA_OPMASK_BEGIN; 251 off = 0; 252 for(int n = 0; n < KRegister::number_of_registers; n++) { 253 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 254 } 255 #endif 256 } 257 } else { 258 if (VM_Version::supports_evex()) { 259 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 260 int base_addr = XSAVE_AREA_UPPERBANK; 261 off = 0; 262 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 263 for (int n = 16; n < num_xmm_regs; n++) { 264 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 265 } 266 #if COMPILER2_OR_JVMCI 267 base_addr = XSAVE_AREA_OPMASK_BEGIN; 268 off = 0; 269 for(int n = 0; n < KRegister::number_of_registers; n++) { 270 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 271 } 272 #endif 273 } 274 } 275 276 #if COMPILER2_OR_JVMCI 277 if (UseAPX) { 278 int base_addr = XSAVE_AREA_EGPRS; 279 off = 0; 280 for(int n = 16; n < Register::number_of_registers; n++) { 281 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 282 } 283 } 284 #endif 285 286 __ vzeroupper(); 287 if (frame::arg_reg_save_area_bytes != 0) { 288 // Allocate argument register save area 289 __ subptr(rsp, frame::arg_reg_save_area_bytes); 290 } 291 292 // Set an oopmap for the call site. This oopmap will map all 293 // oop-registers and debug-info registers as callee-saved. This 294 // will allow deoptimization at this safepoint to find all possible 295 // debug-info recordings, as well as let GC find all oops. 296 297 OopMapSet *oop_maps = new OopMapSet(); 298 OopMap* map = new OopMap(frame_size_in_slots, 0); 299 300 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 301 302 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 306 // rbp location is known implicitly by the frame sender code, needs no oopmap 307 // and the location where rbp was saved by is ignored 308 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 318 319 if (UseAPX) { 320 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 324 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 325 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 326 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 327 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 328 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 329 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 330 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 331 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 332 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 333 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 334 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 335 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 336 } 337 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 338 // on EVEX enabled targets, we get it included in the xsave area 339 off = xmm0_off; 340 int delta = xmm1_off - off; 341 for (int n = 0; n < 16; n++) { 342 XMMRegister xmm_name = as_XMMRegister(n); 343 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 344 off += delta; 345 } 346 if (UseAVX > 2) { 347 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 348 off = zmm16_off; 349 delta = zmm17_off - off; 350 for (int n = 16; n < num_xmm_regs; n++) { 351 XMMRegister zmm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 353 off += delta; 354 } 355 } 356 357 #if COMPILER2_OR_JVMCI 358 if (save_wide_vectors) { 359 // Save upper half of YMM registers(0..15) 360 off = ymm0_off; 361 delta = ymm1_off - ymm0_off; 362 for (int n = 0; n < 16; n++) { 363 XMMRegister ymm_name = as_XMMRegister(n); 364 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 365 off += delta; 366 } 367 if (VM_Version::supports_evex()) { 368 // Save upper half of ZMM registers(0..15) 369 off = zmm0_off; 370 delta = zmm1_off - zmm0_off; 371 for (int n = 0; n < 16; n++) { 372 XMMRegister zmm_name = as_XMMRegister(n); 373 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 374 off += delta; 375 } 376 } 377 } 378 #endif // COMPILER2_OR_JVMCI 379 380 // %%% These should all be a waste but we'll keep things as they were for now 381 if (true) { 382 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 385 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 386 // rbp location is known implicitly by the frame sender code, needs no oopmap 387 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 397 if (UseAPX) { 398 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 402 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 403 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 404 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 405 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 406 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 407 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 408 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 409 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 410 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 411 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 412 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 413 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 414 } 415 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 416 // on EVEX enabled targets, we get it included in the xsave area 417 off = xmm0H_off; 418 delta = xmm1H_off - off; 419 for (int n = 0; n < 16; n++) { 420 XMMRegister xmm_name = as_XMMRegister(n); 421 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 422 off += delta; 423 } 424 if (UseAVX > 2) { 425 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 426 off = zmm16H_off; 427 delta = zmm17H_off - off; 428 for (int n = 16; n < num_xmm_regs; n++) { 429 XMMRegister zmm_name = as_XMMRegister(n); 430 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 431 off += delta; 432 } 433 } 434 } 435 436 return map; 437 } 438 439 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 440 int num_xmm_regs = XMMRegister::available_xmm_registers(); 441 if (frame::arg_reg_save_area_bytes != 0) { 442 // Pop arg register save area 443 __ addptr(rsp, frame::arg_reg_save_area_bytes); 444 } 445 446 #if COMPILER2_OR_JVMCI 447 if (restore_wide_vectors) { 448 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 449 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 450 } 451 #else 452 assert(!restore_wide_vectors, "vectors are generated only by C2"); 453 #endif 454 455 __ vzeroupper(); 456 457 // On EVEX enabled targets everything is handled in pop fpu state 458 if (restore_wide_vectors) { 459 // Restore upper half of YMM registers (0..15) 460 int base_addr = XSAVE_AREA_YMM_BEGIN; 461 for (int n = 0; n < 16; n++) { 462 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 463 } 464 if (VM_Version::supports_evex()) { 465 // Restore upper half of ZMM registers (0..15) 466 base_addr = XSAVE_AREA_ZMM_BEGIN; 467 for (int n = 0; n < 16; n++) { 468 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 469 } 470 // Restore full ZMM registers(16..num_xmm_regs) 471 base_addr = XSAVE_AREA_UPPERBANK; 472 int vector_len = Assembler::AVX_512bit; 473 int off = 0; 474 for (int n = 16; n < num_xmm_regs; n++) { 475 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 476 } 477 #if COMPILER2_OR_JVMCI 478 base_addr = XSAVE_AREA_OPMASK_BEGIN; 479 off = 0; 480 for (int n = 0; n < KRegister::number_of_registers; n++) { 481 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 482 } 483 #endif 484 } 485 } else { 486 if (VM_Version::supports_evex()) { 487 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 488 int base_addr = XSAVE_AREA_UPPERBANK; 489 int off = 0; 490 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 491 for (int n = 16; n < num_xmm_regs; n++) { 492 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 493 } 494 #if COMPILER2_OR_JVMCI 495 base_addr = XSAVE_AREA_OPMASK_BEGIN; 496 off = 0; 497 for (int n = 0; n < KRegister::number_of_registers; n++) { 498 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 499 } 500 #endif 501 } 502 } 503 504 #if COMPILER2_OR_JVMCI 505 if (UseAPX) { 506 int base_addr = XSAVE_AREA_EGPRS; 507 int off = 0; 508 for (int n = 16; n < Register::number_of_registers; n++) { 509 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 510 } 511 } 512 #endif 513 514 // Recover CPU state 515 __ pop_FPU_state(); 516 __ restore_legacy_gprs(); 517 __ addq(rsp, 8); 518 __ popf(); 519 // Get the rbp described implicitly by the calling convention (no oopMap) 520 __ pop(rbp); 521 } 522 523 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 524 525 // Just restore result register. Only used by deoptimization. By 526 // now any callee save register that needs to be restored to a c2 527 // caller of the deoptee has been extracted into the vframeArray 528 // and will be stuffed into the c2i adapter we create for later 529 // restoration so only result registers need to be restored here. 530 531 // Restore fp result register 532 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes())); 533 // Restore integer result register 534 __ movptr(rax, Address(rsp, rax_offset_in_bytes())); 535 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes())); 536 537 // Pop all of the register save are off the stack except the return address 538 __ addptr(rsp, return_offset_in_bytes()); 539 } 540 541 // Is vector's size (in bytes) bigger than a size saved by default? 542 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions. 543 bool SharedRuntime::is_wide_vector(int size) { 544 return size > 16; 545 } 546 547 // --------------------------------------------------------------------------- 548 // Read the array of BasicTypes from a signature, and compute where the 549 // arguments should go. Values in the VMRegPair regs array refer to 4-byte 550 // quantities. Values less than VMRegImpl::stack0 are registers, those above 551 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer 552 // as framesizes are fixed. 553 // VMRegImpl::stack0 refers to the first slot 0(sp). 554 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher. 555 // Register up to Register::number_of_registers are the 64-bit 556 // integer registers. 557 558 // Note: the INPUTS in sig_bt are in units of Java argument words, which are 559 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit 560 // units regardless of build. Of course for i486 there is no 64 bit build 561 562 // The Java calling convention is a "shifted" version of the C ABI. 563 // By skipping the first C ABI register we can call non-static jni methods 564 // with small numbers of arguments without having to shuffle the arguments 565 // at all. Since we control the java ABI we ought to at least get some 566 // advantage out of it. 567 568 int SharedRuntime::java_calling_convention(const BasicType *sig_bt, 569 VMRegPair *regs, 570 int total_args_passed) { 571 572 // Create the mapping between argument positions and 573 // registers. 574 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 575 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 576 }; 577 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 578 j_farg0, j_farg1, j_farg2, j_farg3, 579 j_farg4, j_farg5, j_farg6, j_farg7 580 }; 581 582 583 uint int_args = 0; 584 uint fp_args = 0; 585 uint stk_args = 0; 586 587 for (int i = 0; i < total_args_passed; i++) { 588 switch (sig_bt[i]) { 589 case T_BOOLEAN: 590 case T_CHAR: 591 case T_BYTE: 592 case T_SHORT: 593 case T_INT: 594 if (int_args < Argument::n_int_register_parameters_j) { 595 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 596 } else { 597 stk_args = align_up(stk_args, 2); 598 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 599 stk_args += 1; 600 } 601 break; 602 case T_VOID: 603 // halves of T_LONG or T_DOUBLE 604 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 605 regs[i].set_bad(); 606 break; 607 case T_LONG: 608 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 609 // fall through 610 case T_OBJECT: 611 case T_ARRAY: 612 case T_ADDRESS: 613 if (int_args < Argument::n_int_register_parameters_j) { 614 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 615 } else { 616 stk_args = align_up(stk_args, 2); 617 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 618 stk_args += 2; 619 } 620 break; 621 case T_FLOAT: 622 if (fp_args < Argument::n_float_register_parameters_j) { 623 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 624 } else { 625 stk_args = align_up(stk_args, 2); 626 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 627 stk_args += 1; 628 } 629 break; 630 case T_DOUBLE: 631 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 632 if (fp_args < Argument::n_float_register_parameters_j) { 633 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 634 } else { 635 stk_args = align_up(stk_args, 2); 636 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 637 stk_args += 2; 638 } 639 break; 640 default: 641 ShouldNotReachHere(); 642 break; 643 } 644 } 645 646 return stk_args; 647 } 648 649 // Patch the callers callsite with entry to compiled code if it exists. 650 static void patch_callers_callsite(MacroAssembler *masm) { 651 Label L; 652 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 653 __ jcc(Assembler::equal, L); 654 655 // Save the current stack pointer 656 __ mov(r13, rsp); 657 // Schedule the branch target address early. 658 // Call into the VM to patch the caller, then jump to compiled callee 659 // rax isn't live so capture return address while we easily can 660 __ movptr(rax, Address(rsp, 0)); 661 662 // align stack so push_CPU_state doesn't fault 663 __ andptr(rsp, -(StackAlignmentInBytes)); 664 __ push_CPU_state(); 665 __ vzeroupper(); 666 // VM needs caller's callsite 667 // VM needs target method 668 // This needs to be a long call since we will relocate this adapter to 669 // the codeBuffer and it may not reach 670 671 // Allocate argument register save area 672 if (frame::arg_reg_save_area_bytes != 0) { 673 __ subptr(rsp, frame::arg_reg_save_area_bytes); 674 } 675 __ mov(c_rarg0, rbx); 676 __ mov(c_rarg1, rax); 677 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 678 679 // De-allocate argument register save area 680 if (frame::arg_reg_save_area_bytes != 0) { 681 __ addptr(rsp, frame::arg_reg_save_area_bytes); 682 } 683 684 __ vzeroupper(); 685 __ pop_CPU_state(); 686 // restore sp 687 __ mov(rsp, r13); 688 __ bind(L); 689 } 690 691 692 static void gen_c2i_adapter(MacroAssembler *masm, 693 int total_args_passed, 694 int comp_args_on_stack, 695 const BasicType *sig_bt, 696 const VMRegPair *regs, 697 Label& skip_fixup) { 698 // Before we get into the guts of the C2I adapter, see if we should be here 699 // at all. We've come from compiled code and are attempting to jump to the 700 // interpreter, which means the caller made a static call to get here 701 // (vcalls always get a compiled target if there is one). Check for a 702 // compiled target. If there is one, we need to patch the caller's call. 703 patch_callers_callsite(masm); 704 705 __ bind(skip_fixup); 706 707 // Since all args are passed on the stack, total_args_passed * 708 // Interpreter::stackElementSize is the space we need. 709 710 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 711 712 int extraspace = (total_args_passed * Interpreter::stackElementSize); 713 714 // stack is aligned, keep it that way 715 // This is not currently needed or enforced by the interpreter, but 716 // we might as well conform to the ABI. 717 extraspace = align_up(extraspace, 2*wordSize); 718 719 // set senderSP value 720 __ lea(r13, Address(rsp, wordSize)); 721 722 #ifdef ASSERT 723 __ check_stack_alignment(r13, "sender stack not aligned"); 724 #endif 725 if (extraspace > 0) { 726 // Pop the return address 727 __ pop(rax); 728 729 __ subptr(rsp, extraspace); 730 731 // Push the return address 732 __ push(rax); 733 734 // Account for the return address location since we store it first rather 735 // than hold it in a register across all the shuffling 736 extraspace += wordSize; 737 } 738 739 #ifdef ASSERT 740 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 741 #endif 742 743 // Now write the args into the outgoing interpreter space 744 for (int i = 0; i < total_args_passed; i++) { 745 if (sig_bt[i] == T_VOID) { 746 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 747 continue; 748 } 749 750 // offset to start parameters 751 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 752 int next_off = st_off - Interpreter::stackElementSize; 753 754 // Say 4 args: 755 // i st_off 756 // 0 32 T_LONG 757 // 1 24 T_VOID 758 // 2 16 T_OBJECT 759 // 3 8 T_BOOL 760 // - 0 return address 761 // 762 // However to make thing extra confusing. Because we can fit a long/double in 763 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 764 // leaves one slot empty and only stores to a single slot. In this case the 765 // slot that is occupied is the T_VOID slot. See I said it was confusing. 766 767 VMReg r_1 = regs[i].first(); 768 VMReg r_2 = regs[i].second(); 769 if (!r_1->is_valid()) { 770 assert(!r_2->is_valid(), ""); 771 continue; 772 } 773 if (r_1->is_stack()) { 774 // memory to memory use rax 775 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 776 if (!r_2->is_valid()) { 777 // sign extend?? 778 __ movl(rax, Address(rsp, ld_off)); 779 __ movptr(Address(rsp, st_off), rax); 780 781 } else { 782 783 __ movq(rax, Address(rsp, ld_off)); 784 785 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 786 // T_DOUBLE and T_LONG use two slots in the interpreter 787 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 788 // ld_off == LSW, ld_off+wordSize == MSW 789 // st_off == MSW, next_off == LSW 790 __ movq(Address(rsp, next_off), rax); 791 #ifdef ASSERT 792 // Overwrite the unused slot with known junk 793 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 794 __ movptr(Address(rsp, st_off), rax); 795 #endif /* ASSERT */ 796 } else { 797 __ movq(Address(rsp, st_off), rax); 798 } 799 } 800 } else if (r_1->is_Register()) { 801 Register r = r_1->as_Register(); 802 if (!r_2->is_valid()) { 803 // must be only an int (or less ) so move only 32bits to slot 804 // why not sign extend?? 805 __ movl(Address(rsp, st_off), r); 806 } else { 807 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 808 // T_DOUBLE and T_LONG use two slots in the interpreter 809 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 810 // long/double in gpr 811 #ifdef ASSERT 812 // Overwrite the unused slot with known junk 813 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 814 __ movptr(Address(rsp, st_off), rax); 815 #endif /* ASSERT */ 816 __ movq(Address(rsp, next_off), r); 817 } else { 818 __ movptr(Address(rsp, st_off), r); 819 } 820 } 821 } else { 822 assert(r_1->is_XMMRegister(), ""); 823 if (!r_2->is_valid()) { 824 // only a float use just part of the slot 825 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 826 } else { 827 #ifdef ASSERT 828 // Overwrite the unused slot with known junk 829 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 830 __ movptr(Address(rsp, st_off), rax); 831 #endif /* ASSERT */ 832 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 833 } 834 } 835 } 836 837 // Schedule the branch target address early. 838 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 839 __ jmp(rcx); 840 } 841 842 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, 843 address code_start, address code_end, 844 Label& L_ok) { 845 Label L_fail; 846 __ lea(temp_reg, ExternalAddress(code_start)); 847 __ cmpptr(pc_reg, temp_reg); 848 __ jcc(Assembler::belowEqual, L_fail); 849 __ lea(temp_reg, ExternalAddress(code_end)); 850 __ cmpptr(pc_reg, temp_reg); 851 __ jcc(Assembler::below, L_ok); 852 __ bind(L_fail); 853 } 854 855 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 856 int total_args_passed, 857 int comp_args_on_stack, 858 const BasicType *sig_bt, 859 const VMRegPair *regs) { 860 861 // Note: r13 contains the senderSP on entry. We must preserve it since 862 // we may do a i2c -> c2i transition if we lose a race where compiled 863 // code goes non-entrant while we get args ready. 864 // In addition we use r13 to locate all the interpreter args as 865 // we must align the stack to 16 bytes on an i2c entry else we 866 // lose alignment we expect in all compiled code and register 867 // save code can segv when fxsave instructions find improperly 868 // aligned stack pointer. 869 870 // Adapters can be frameless because they do not require the caller 871 // to perform additional cleanup work, such as correcting the stack pointer. 872 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 873 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 874 // even if a callee has modified the stack pointer. 875 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 876 // routinely repairs its caller's stack pointer (from sender_sp, which is set 877 // up via the senderSP register). 878 // In other words, if *either* the caller or callee is interpreted, we can 879 // get the stack pointer repaired after a call. 880 // This is why c2i and i2c adapters cannot be indefinitely composed. 881 // In particular, if a c2i adapter were to somehow call an i2c adapter, 882 // both caller and callee would be compiled methods, and neither would 883 // clean up the stack pointer changes performed by the two adapters. 884 // If this happens, control eventually transfers back to the compiled 885 // caller, but with an uncorrected stack, causing delayed havoc. 886 887 if (VerifyAdapterCalls && 888 (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) { 889 // So, let's test for cascading c2i/i2c adapters right now. 890 // assert(Interpreter::contains($return_addr) || 891 // StubRoutines::contains($return_addr), 892 // "i2c adapter must return to an interpreter frame"); 893 __ block_comment("verify_i2c { "); 894 // Pick up the return address 895 __ movptr(rax, Address(rsp, 0)); 896 Label L_ok; 897 if (Interpreter::code() != nullptr) { 898 range_check(masm, rax, r11, 899 Interpreter::code()->code_start(), 900 Interpreter::code()->code_end(), 901 L_ok); 902 } 903 if (StubRoutines::initial_stubs_code() != nullptr) { 904 range_check(masm, rax, r11, 905 StubRoutines::initial_stubs_code()->code_begin(), 906 StubRoutines::initial_stubs_code()->code_end(), 907 L_ok); 908 } 909 if (StubRoutines::final_stubs_code() != nullptr) { 910 range_check(masm, rax, r11, 911 StubRoutines::final_stubs_code()->code_begin(), 912 StubRoutines::final_stubs_code()->code_end(), 913 L_ok); 914 } 915 const char* msg = "i2c adapter must return to an interpreter frame"; 916 __ block_comment(msg); 917 __ stop(msg); 918 __ bind(L_ok); 919 __ block_comment("} verify_i2ce "); 920 } 921 922 // Must preserve original SP for loading incoming arguments because 923 // we need to align the outgoing SP for compiled code. 924 __ movptr(r11, rsp); 925 926 // Pick up the return address 927 __ pop(rax); 928 929 // Convert 4-byte c2 stack slots to words. 930 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 931 932 if (comp_args_on_stack) { 933 __ subptr(rsp, comp_words_on_stack * wordSize); 934 } 935 936 // Ensure compiled code always sees stack at proper alignment 937 __ andptr(rsp, -16); 938 939 // push the return address and misalign the stack that youngest frame always sees 940 // as far as the placement of the call instruction 941 __ push(rax); 942 943 // Put saved SP in another register 944 const Register saved_sp = rax; 945 __ movptr(saved_sp, r11); 946 947 // Will jump to the compiled code just as if compiled code was doing it. 948 // Pre-load the register-jump target early, to schedule it better. 949 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 950 951 #if INCLUDE_JVMCI 952 if (EnableJVMCI) { 953 // check if this call should be routed towards a specific entry point 954 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 955 Label no_alternative_target; 956 __ jcc(Assembler::equal, no_alternative_target); 957 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 958 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 959 __ bind(no_alternative_target); 960 } 961 #endif // INCLUDE_JVMCI 962 963 // Now generate the shuffle code. Pick up all register args and move the 964 // rest through the floating point stack top. 965 for (int i = 0; i < total_args_passed; i++) { 966 if (sig_bt[i] == T_VOID) { 967 // Longs and doubles are passed in native word order, but misaligned 968 // in the 32-bit build. 969 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 970 continue; 971 } 972 973 // Pick up 0, 1 or 2 words from SP+offset. 974 975 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 976 "scrambled load targets?"); 977 // Load in argument order going down. 978 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 979 // Point to interpreter value (vs. tag) 980 int next_off = ld_off - Interpreter::stackElementSize; 981 // 982 // 983 // 984 VMReg r_1 = regs[i].first(); 985 VMReg r_2 = regs[i].second(); 986 if (!r_1->is_valid()) { 987 assert(!r_2->is_valid(), ""); 988 continue; 989 } 990 if (r_1->is_stack()) { 991 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 992 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 993 994 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 995 // and if we end up going thru a c2i because of a miss a reasonable value of r13 996 // will be generated. 997 if (!r_2->is_valid()) { 998 // sign extend??? 999 __ movl(r13, Address(saved_sp, ld_off)); 1000 __ movptr(Address(rsp, st_off), r13); 1001 } else { 1002 // 1003 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1004 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1005 // So we must adjust where to pick up the data to match the interpreter. 1006 // 1007 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 1008 // are accessed as negative so LSW is at LOW address 1009 1010 // ld_off is MSW so get LSW 1011 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1012 next_off : ld_off; 1013 __ movq(r13, Address(saved_sp, offset)); 1014 // st_off is LSW (i.e. reg.first()) 1015 __ movq(Address(rsp, st_off), r13); 1016 } 1017 } else if (r_1->is_Register()) { // Register argument 1018 Register r = r_1->as_Register(); 1019 assert(r != rax, "must be different"); 1020 if (r_2->is_valid()) { 1021 // 1022 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1023 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1024 // So we must adjust where to pick up the data to match the interpreter. 1025 1026 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1027 next_off : ld_off; 1028 1029 // this can be a misaligned move 1030 __ movq(r, Address(saved_sp, offset)); 1031 } else { 1032 // sign extend and use a full word? 1033 __ movl(r, Address(saved_sp, ld_off)); 1034 } 1035 } else { 1036 if (!r_2->is_valid()) { 1037 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 1038 } else { 1039 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 1040 } 1041 } 1042 } 1043 1044 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 1045 1046 // 6243940 We might end up in handle_wrong_method if 1047 // the callee is deoptimized as we race thru here. If that 1048 // happens we don't want to take a safepoint because the 1049 // caller frame will look interpreted and arguments are now 1050 // "compiled" so it is much better to make this transition 1051 // invisible to the stack walking code. Unfortunately if 1052 // we try and find the callee by normal means a safepoint 1053 // is possible. So we stash the desired callee in the thread 1054 // and the vm will find there should this case occur. 1055 1056 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 1057 1058 // put Method* where a c2i would expect should we end up there 1059 // only needed because eof c2 resolve stubs return Method* as a result in 1060 // rax 1061 __ mov(rax, rbx); 1062 __ jmp(r11); 1063 } 1064 1065 // --------------------------------------------------------------- 1066 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 1067 int total_args_passed, 1068 int comp_args_on_stack, 1069 const BasicType *sig_bt, 1070 const VMRegPair *regs, 1071 AdapterFingerPrint* fingerprint) { 1072 address i2c_entry = __ pc(); 1073 1074 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 1075 1076 // ------------------------------------------------------------------------- 1077 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1078 // to the interpreter. The args start out packed in the compiled layout. They 1079 // need to be unpacked into the interpreter layout. This will almost always 1080 // require some stack space. We grow the current (compiled) stack, then repack 1081 // the args. We finally end in a jump to the generic interpreter entry point. 1082 // On exit from the interpreter, the interpreter will restore our SP (lest the 1083 // compiled code, which relies solely on SP and not RBP, get sick). 1084 1085 address c2i_unverified_entry = __ pc(); 1086 Label skip_fixup; 1087 1088 Register data = rax; 1089 Register receiver = j_rarg0; 1090 Register temp = rbx; 1091 1092 { 1093 __ ic_check(1 /* end_alignment */); 1094 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1095 // Method might have been compiled since the call site was patched to 1096 // interpreted if that is the case treat it as a miss so we can get 1097 // the call site corrected. 1098 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1099 __ jcc(Assembler::equal, skip_fixup); 1100 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1101 } 1102 1103 address c2i_entry = __ pc(); 1104 1105 // Class initialization barrier for static methods 1106 address c2i_no_clinit_check_entry = nullptr; 1107 if (VM_Version::supports_fast_class_init_checks()) { 1108 Label L_skip_barrier; 1109 Register method = rbx; 1110 1111 { // Bypass the barrier for non-static methods 1112 Register flags = rscratch1; 1113 __ movl(flags, Address(method, Method::access_flags_offset())); 1114 __ testl(flags, JVM_ACC_STATIC); 1115 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1116 } 1117 1118 Register klass = rscratch1; 1119 __ load_method_holder(klass, method); 1120 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1121 1122 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1123 1124 __ bind(L_skip_barrier); 1125 c2i_no_clinit_check_entry = __ pc(); 1126 } 1127 1128 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1129 bs->c2i_entry_barrier(masm); 1130 1131 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1132 1133 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1134 } 1135 1136 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1137 VMRegPair *regs, 1138 int total_args_passed) { 1139 1140 // We return the amount of VMRegImpl stack slots we need to reserve for all 1141 // the arguments NOT counting out_preserve_stack_slots. 1142 1143 // NOTE: These arrays will have to change when c1 is ported 1144 #ifdef _WIN64 1145 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1146 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1147 }; 1148 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1149 c_farg0, c_farg1, c_farg2, c_farg3 1150 }; 1151 #else 1152 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1153 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1154 }; 1155 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1156 c_farg0, c_farg1, c_farg2, c_farg3, 1157 c_farg4, c_farg5, c_farg6, c_farg7 1158 }; 1159 #endif // _WIN64 1160 1161 1162 uint int_args = 0; 1163 uint fp_args = 0; 1164 uint stk_args = 0; // inc by 2 each time 1165 1166 for (int i = 0; i < total_args_passed; i++) { 1167 switch (sig_bt[i]) { 1168 case T_BOOLEAN: 1169 case T_CHAR: 1170 case T_BYTE: 1171 case T_SHORT: 1172 case T_INT: 1173 if (int_args < Argument::n_int_register_parameters_c) { 1174 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1175 #ifdef _WIN64 1176 fp_args++; 1177 // Allocate slots for callee to stuff register args the stack. 1178 stk_args += 2; 1179 #endif 1180 } else { 1181 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1182 stk_args += 2; 1183 } 1184 break; 1185 case T_LONG: 1186 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1187 // fall through 1188 case T_OBJECT: 1189 case T_ARRAY: 1190 case T_ADDRESS: 1191 case T_METADATA: 1192 if (int_args < Argument::n_int_register_parameters_c) { 1193 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1194 #ifdef _WIN64 1195 fp_args++; 1196 stk_args += 2; 1197 #endif 1198 } else { 1199 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1200 stk_args += 2; 1201 } 1202 break; 1203 case T_FLOAT: 1204 if (fp_args < Argument::n_float_register_parameters_c) { 1205 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1206 #ifdef _WIN64 1207 int_args++; 1208 // Allocate slots for callee to stuff register args the stack. 1209 stk_args += 2; 1210 #endif 1211 } else { 1212 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1213 stk_args += 2; 1214 } 1215 break; 1216 case T_DOUBLE: 1217 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1218 if (fp_args < Argument::n_float_register_parameters_c) { 1219 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1220 #ifdef _WIN64 1221 int_args++; 1222 // Allocate slots for callee to stuff register args the stack. 1223 stk_args += 2; 1224 #endif 1225 } else { 1226 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1227 stk_args += 2; 1228 } 1229 break; 1230 case T_VOID: // Halves of longs and doubles 1231 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1232 regs[i].set_bad(); 1233 break; 1234 default: 1235 ShouldNotReachHere(); 1236 break; 1237 } 1238 } 1239 #ifdef _WIN64 1240 // windows abi requires that we always allocate enough stack space 1241 // for 4 64bit registers to be stored down. 1242 if (stk_args < 8) { 1243 stk_args = 8; 1244 } 1245 #endif // _WIN64 1246 1247 return stk_args; 1248 } 1249 1250 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1251 uint num_bits, 1252 uint total_args_passed) { 1253 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1254 "only certain vector sizes are supported for now"); 1255 1256 static const XMMRegister VEC_ArgReg[32] = { 1257 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1258 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1259 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1260 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1261 }; 1262 1263 uint stk_args = 0; 1264 uint fp_args = 0; 1265 1266 for (uint i = 0; i < total_args_passed; i++) { 1267 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1268 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1269 regs[i].set_pair(vmreg->next(next_val), vmreg); 1270 } 1271 1272 return stk_args; 1273 } 1274 1275 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1276 // We always ignore the frame_slots arg and just use the space just below frame pointer 1277 // which by this time is free to use 1278 switch (ret_type) { 1279 case T_FLOAT: 1280 __ movflt(Address(rbp, -wordSize), xmm0); 1281 break; 1282 case T_DOUBLE: 1283 __ movdbl(Address(rbp, -wordSize), xmm0); 1284 break; 1285 case T_VOID: break; 1286 default: { 1287 __ movptr(Address(rbp, -wordSize), rax); 1288 } 1289 } 1290 } 1291 1292 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1293 // We always ignore the frame_slots arg and just use the space just below frame pointer 1294 // which by this time is free to use 1295 switch (ret_type) { 1296 case T_FLOAT: 1297 __ movflt(xmm0, Address(rbp, -wordSize)); 1298 break; 1299 case T_DOUBLE: 1300 __ movdbl(xmm0, Address(rbp, -wordSize)); 1301 break; 1302 case T_VOID: break; 1303 default: { 1304 __ movptr(rax, Address(rbp, -wordSize)); 1305 } 1306 } 1307 } 1308 1309 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1310 for ( int i = first_arg ; i < arg_count ; i++ ) { 1311 if (args[i].first()->is_Register()) { 1312 __ push(args[i].first()->as_Register()); 1313 } else if (args[i].first()->is_XMMRegister()) { 1314 __ subptr(rsp, 2*wordSize); 1315 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1316 } 1317 } 1318 } 1319 1320 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1321 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1322 if (args[i].first()->is_Register()) { 1323 __ pop(args[i].first()->as_Register()); 1324 } else if (args[i].first()->is_XMMRegister()) { 1325 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1326 __ addptr(rsp, 2*wordSize); 1327 } 1328 } 1329 } 1330 1331 static void verify_oop_args(MacroAssembler* masm, 1332 const methodHandle& method, 1333 const BasicType* sig_bt, 1334 const VMRegPair* regs) { 1335 Register temp_reg = rbx; // not part of any compiled calling seq 1336 if (VerifyOops) { 1337 for (int i = 0; i < method->size_of_parameters(); i++) { 1338 if (is_reference_type(sig_bt[i])) { 1339 VMReg r = regs[i].first(); 1340 assert(r->is_valid(), "bad oop arg"); 1341 if (r->is_stack()) { 1342 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1343 __ verify_oop(temp_reg); 1344 } else { 1345 __ verify_oop(r->as_Register()); 1346 } 1347 } 1348 } 1349 } 1350 } 1351 1352 static void check_continuation_enter_argument(VMReg actual_vmreg, 1353 Register expected_reg, 1354 const char* name) { 1355 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1356 assert(actual_vmreg->as_Register() == expected_reg, 1357 "%s is in unexpected register: %s instead of %s", 1358 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1359 } 1360 1361 1362 //---------------------------- continuation_enter_setup --------------------------- 1363 // 1364 // Arguments: 1365 // None. 1366 // 1367 // Results: 1368 // rsp: pointer to blank ContinuationEntry 1369 // 1370 // Kills: 1371 // rax 1372 // 1373 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1374 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1375 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1376 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1377 1378 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1379 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1380 1381 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1382 OopMap* map = new OopMap(frame_size, 0); 1383 1384 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1385 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1386 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1387 1388 return map; 1389 } 1390 1391 //---------------------------- fill_continuation_entry --------------------------- 1392 // 1393 // Arguments: 1394 // rsp: pointer to blank Continuation entry 1395 // reg_cont_obj: pointer to the continuation 1396 // reg_flags: flags 1397 // 1398 // Results: 1399 // rsp: pointer to filled out ContinuationEntry 1400 // 1401 // Kills: 1402 // rax 1403 // 1404 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1405 assert_different_registers(rax, reg_cont_obj, reg_flags); 1406 #ifdef ASSERT 1407 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1408 #endif 1409 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1410 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1411 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1412 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1413 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1414 1415 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1416 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1417 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1418 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1419 1420 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1421 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1422 } 1423 1424 //---------------------------- continuation_enter_cleanup --------------------------- 1425 // 1426 // Arguments: 1427 // rsp: pointer to the ContinuationEntry 1428 // 1429 // Results: 1430 // rsp: pointer to the spilled rbp in the entry frame 1431 // 1432 // Kills: 1433 // rbx 1434 // 1435 static void continuation_enter_cleanup(MacroAssembler* masm) { 1436 #ifdef ASSERT 1437 Label L_good_sp; 1438 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1439 __ jcc(Assembler::equal, L_good_sp); 1440 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1441 __ bind(L_good_sp); 1442 #endif 1443 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1444 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1445 1446 if (CheckJNICalls) { 1447 // Check if this is a virtual thread continuation 1448 Label L_skip_vthread_code; 1449 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1450 __ jcc(Assembler::equal, L_skip_vthread_code); 1451 1452 // If the held monitor count is > 0 and this vthread is terminating then 1453 // it failed to release a JNI monitor. So we issue the same log message 1454 // that JavaThread::exit does. 1455 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1456 __ jcc(Assembler::equal, L_skip_vthread_code); 1457 1458 // rax may hold an exception oop, save it before the call 1459 __ push(rax); 1460 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1461 __ pop(rax); 1462 1463 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1464 // on termination. The held count is implicitly zeroed below when we restore from 1465 // the parent held count (which has to be zero). 1466 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1467 1468 __ bind(L_skip_vthread_code); 1469 } 1470 #ifdef ASSERT 1471 else { 1472 // Check if this is a virtual thread continuation 1473 Label L_skip_vthread_code; 1474 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1475 __ jcc(Assembler::equal, L_skip_vthread_code); 1476 1477 // See comment just above. If not checking JNI calls the JNI count is only 1478 // needed for assertion checking. 1479 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1480 1481 __ bind(L_skip_vthread_code); 1482 } 1483 #endif 1484 1485 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1486 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1487 1488 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1489 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1490 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1491 } 1492 1493 static void gen_continuation_enter(MacroAssembler* masm, 1494 const VMRegPair* regs, 1495 int& exception_offset, 1496 OopMapSet* oop_maps, 1497 int& frame_complete, 1498 int& stack_slots, 1499 int& interpreted_entry_offset, 1500 int& compiled_entry_offset) { 1501 1502 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1503 int pos_cont_obj = 0; 1504 int pos_is_cont = 1; 1505 int pos_is_virtual = 2; 1506 1507 // The platform-specific calling convention may present the arguments in various registers. 1508 // To simplify the rest of the code, we expect the arguments to reside at these known 1509 // registers, and we additionally check the placement here in case calling convention ever 1510 // changes. 1511 Register reg_cont_obj = c_rarg1; 1512 Register reg_is_cont = c_rarg2; 1513 Register reg_is_virtual = c_rarg3; 1514 1515 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1516 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1517 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1518 1519 // Utility methods kill rax, make sure there are no collisions 1520 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1521 1522 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1523 relocInfo::static_call_type); 1524 1525 address start = __ pc(); 1526 1527 Label L_thaw, L_exit; 1528 1529 // i2i entry used at interp_only_mode only 1530 interpreted_entry_offset = __ pc() - start; 1531 { 1532 #ifdef ASSERT 1533 Label is_interp_only; 1534 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1535 __ jcc(Assembler::notEqual, is_interp_only); 1536 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1537 __ bind(is_interp_only); 1538 #endif 1539 1540 __ pop(rax); // return address 1541 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1542 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1543 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1544 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1545 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1546 __ push(rax); // return address 1547 __ push_cont_fastpath(); 1548 1549 __ enter(); 1550 1551 stack_slots = 2; // will be adjusted in setup 1552 OopMap* map = continuation_enter_setup(masm, stack_slots); 1553 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1554 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1555 1556 __ verify_oop(reg_cont_obj); 1557 1558 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1559 1560 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1561 __ testptr(reg_is_cont, reg_is_cont); 1562 __ jcc(Assembler::notZero, L_thaw); 1563 1564 // --- Resolve path 1565 1566 // Make sure the call is patchable 1567 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1568 // Emit stub for static call 1569 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1570 if (stub == nullptr) { 1571 fatal("CodeCache is full at gen_continuation_enter"); 1572 } 1573 __ call(resolve); 1574 oop_maps->add_gc_map(__ pc() - start, map); 1575 __ post_call_nop(); 1576 1577 __ jmp(L_exit); 1578 } 1579 1580 // compiled entry 1581 __ align(CodeEntryAlignment); 1582 compiled_entry_offset = __ pc() - start; 1583 __ enter(); 1584 1585 stack_slots = 2; // will be adjusted in setup 1586 OopMap* map = continuation_enter_setup(masm, stack_slots); 1587 1588 // Frame is now completed as far as size and linkage. 1589 frame_complete = __ pc() - start; 1590 1591 __ verify_oop(reg_cont_obj); 1592 1593 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1594 1595 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1596 __ testptr(reg_is_cont, reg_is_cont); 1597 __ jccb(Assembler::notZero, L_thaw); 1598 1599 // --- call Continuation.enter(Continuation c, boolean isContinue) 1600 1601 // Make sure the call is patchable 1602 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1603 1604 // Emit stub for static call 1605 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1606 if (stub == nullptr) { 1607 fatal("CodeCache is full at gen_continuation_enter"); 1608 } 1609 1610 // The call needs to be resolved. There's a special case for this in 1611 // SharedRuntime::find_callee_info_helper() which calls 1612 // LinkResolver::resolve_continuation_enter() which resolves the call to 1613 // Continuation.enter(Continuation c, boolean isContinue). 1614 __ call(resolve); 1615 1616 oop_maps->add_gc_map(__ pc() - start, map); 1617 __ post_call_nop(); 1618 1619 __ jmpb(L_exit); 1620 1621 // --- Thawing path 1622 1623 __ bind(L_thaw); 1624 1625 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1626 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1627 1628 ContinuationEntry::_return_pc_offset = __ pc() - start; 1629 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1630 __ post_call_nop(); 1631 1632 // --- Normal exit (resolve/thawing) 1633 1634 __ bind(L_exit); 1635 1636 continuation_enter_cleanup(masm); 1637 __ pop(rbp); 1638 __ ret(0); 1639 1640 // --- Exception handling path 1641 1642 exception_offset = __ pc() - start; 1643 1644 continuation_enter_cleanup(masm); 1645 __ pop(rbp); 1646 1647 __ movptr(c_rarg0, r15_thread); 1648 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1649 1650 // rax still holds the original exception oop, save it before the call 1651 __ push(rax); 1652 1653 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1654 __ movptr(rbx, rax); 1655 1656 // Continue at exception handler: 1657 // rax: exception oop 1658 // rbx: exception handler 1659 // rdx: exception pc 1660 __ pop(rax); 1661 __ verify_oop(rax); 1662 __ pop(rdx); 1663 __ jmp(rbx); 1664 } 1665 1666 static void gen_continuation_yield(MacroAssembler* masm, 1667 const VMRegPair* regs, 1668 OopMapSet* oop_maps, 1669 int& frame_complete, 1670 int& stack_slots, 1671 int& compiled_entry_offset) { 1672 enum layout { 1673 rbp_off, 1674 rbpH_off, 1675 return_off, 1676 return_off2, 1677 framesize // inclusive of return address 1678 }; 1679 stack_slots = framesize / VMRegImpl::slots_per_word; 1680 assert(stack_slots == 2, "recheck layout"); 1681 1682 address start = __ pc(); 1683 compiled_entry_offset = __ pc() - start; 1684 __ enter(); 1685 address the_pc = __ pc(); 1686 1687 frame_complete = the_pc - start; 1688 1689 // This nop must be exactly at the PC we push into the frame info. 1690 // We use this nop for fast CodeBlob lookup, associate the OopMap 1691 // with it right away. 1692 __ post_call_nop(); 1693 OopMap* map = new OopMap(framesize, 1); 1694 oop_maps->add_gc_map(frame_complete, map); 1695 1696 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1697 __ movptr(c_rarg0, r15_thread); 1698 __ movptr(c_rarg1, rsp); 1699 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1700 __ reset_last_Java_frame(true); 1701 1702 Label L_pinned; 1703 1704 __ testptr(rax, rax); 1705 __ jcc(Assembler::notZero, L_pinned); 1706 1707 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1708 continuation_enter_cleanup(masm); 1709 __ pop(rbp); 1710 __ ret(0); 1711 1712 __ bind(L_pinned); 1713 1714 // Pinned, return to caller 1715 1716 // handle pending exception thrown by freeze 1717 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1718 Label ok; 1719 __ jcc(Assembler::equal, ok); 1720 __ leave(); 1721 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1722 __ bind(ok); 1723 1724 __ leave(); 1725 __ ret(0); 1726 } 1727 1728 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1729 ::continuation_enter_cleanup(masm); 1730 } 1731 1732 static void gen_special_dispatch(MacroAssembler* masm, 1733 const methodHandle& method, 1734 const BasicType* sig_bt, 1735 const VMRegPair* regs) { 1736 verify_oop_args(masm, method, sig_bt, regs); 1737 vmIntrinsics::ID iid = method->intrinsic_id(); 1738 1739 // Now write the args into the outgoing interpreter space 1740 bool has_receiver = false; 1741 Register receiver_reg = noreg; 1742 int member_arg_pos = -1; 1743 Register member_reg = noreg; 1744 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1745 if (ref_kind != 0) { 1746 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1747 member_reg = rbx; // known to be free at this point 1748 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1749 } else if (iid == vmIntrinsics::_invokeBasic) { 1750 has_receiver = true; 1751 } else if (iid == vmIntrinsics::_linkToNative) { 1752 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1753 member_reg = rbx; // known to be free at this point 1754 } else { 1755 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1756 } 1757 1758 if (member_reg != noreg) { 1759 // Load the member_arg into register, if necessary. 1760 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1761 VMReg r = regs[member_arg_pos].first(); 1762 if (r->is_stack()) { 1763 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1764 } else { 1765 // no data motion is needed 1766 member_reg = r->as_Register(); 1767 } 1768 } 1769 1770 if (has_receiver) { 1771 // Make sure the receiver is loaded into a register. 1772 assert(method->size_of_parameters() > 0, "oob"); 1773 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1774 VMReg r = regs[0].first(); 1775 assert(r->is_valid(), "bad receiver arg"); 1776 if (r->is_stack()) { 1777 // Porting note: This assumes that compiled calling conventions always 1778 // pass the receiver oop in a register. If this is not true on some 1779 // platform, pick a temp and load the receiver from stack. 1780 fatal("receiver always in a register"); 1781 receiver_reg = j_rarg0; // known to be free at this point 1782 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1783 } else { 1784 // no data motion is needed 1785 receiver_reg = r->as_Register(); 1786 } 1787 } 1788 1789 // Figure out which address we are really jumping to: 1790 MethodHandles::generate_method_handle_dispatch(masm, iid, 1791 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1792 } 1793 1794 // --------------------------------------------------------------------------- 1795 // Generate a native wrapper for a given method. The method takes arguments 1796 // in the Java compiled code convention, marshals them to the native 1797 // convention (handlizes oops, etc), transitions to native, makes the call, 1798 // returns to java state (possibly blocking), unhandlizes any result and 1799 // returns. 1800 // 1801 // Critical native functions are a shorthand for the use of 1802 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1803 // functions. The wrapper is expected to unpack the arguments before 1804 // passing them to the callee. Critical native functions leave the state _in_Java, 1805 // since they cannot stop for GC. 1806 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1807 // block and the check for pending exceptions it's impossible for them 1808 // to be thrown. 1809 // 1810 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1811 const methodHandle& method, 1812 int compile_id, 1813 BasicType* in_sig_bt, 1814 VMRegPair* in_regs, 1815 BasicType ret_type) { 1816 if (method->is_continuation_native_intrinsic()) { 1817 int exception_offset = -1; 1818 OopMapSet* oop_maps = new OopMapSet(); 1819 int frame_complete = -1; 1820 int stack_slots = -1; 1821 int interpreted_entry_offset = -1; 1822 int vep_offset = -1; 1823 if (method->is_continuation_enter_intrinsic()) { 1824 gen_continuation_enter(masm, 1825 in_regs, 1826 exception_offset, 1827 oop_maps, 1828 frame_complete, 1829 stack_slots, 1830 interpreted_entry_offset, 1831 vep_offset); 1832 } else if (method->is_continuation_yield_intrinsic()) { 1833 gen_continuation_yield(masm, 1834 in_regs, 1835 oop_maps, 1836 frame_complete, 1837 stack_slots, 1838 vep_offset); 1839 } else { 1840 guarantee(false, "Unknown Continuation native intrinsic"); 1841 } 1842 1843 #ifdef ASSERT 1844 if (method->is_continuation_enter_intrinsic()) { 1845 assert(interpreted_entry_offset != -1, "Must be set"); 1846 assert(exception_offset != -1, "Must be set"); 1847 } else { 1848 assert(interpreted_entry_offset == -1, "Must be unset"); 1849 assert(exception_offset == -1, "Must be unset"); 1850 } 1851 assert(frame_complete != -1, "Must be set"); 1852 assert(stack_slots != -1, "Must be set"); 1853 assert(vep_offset != -1, "Must be set"); 1854 #endif 1855 1856 __ flush(); 1857 nmethod* nm = nmethod::new_native_nmethod(method, 1858 compile_id, 1859 masm->code(), 1860 vep_offset, 1861 frame_complete, 1862 stack_slots, 1863 in_ByteSize(-1), 1864 in_ByteSize(-1), 1865 oop_maps, 1866 exception_offset); 1867 if (nm == nullptr) return nm; 1868 if (method->is_continuation_enter_intrinsic()) { 1869 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1870 } else if (method->is_continuation_yield_intrinsic()) { 1871 _cont_doYield_stub = nm; 1872 } 1873 return nm; 1874 } 1875 1876 if (method->is_method_handle_intrinsic()) { 1877 vmIntrinsics::ID iid = method->intrinsic_id(); 1878 intptr_t start = (intptr_t)__ pc(); 1879 int vep_offset = ((intptr_t)__ pc()) - start; 1880 gen_special_dispatch(masm, 1881 method, 1882 in_sig_bt, 1883 in_regs); 1884 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1885 __ flush(); 1886 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1887 return nmethod::new_native_nmethod(method, 1888 compile_id, 1889 masm->code(), 1890 vep_offset, 1891 frame_complete, 1892 stack_slots / VMRegImpl::slots_per_word, 1893 in_ByteSize(-1), 1894 in_ByteSize(-1), 1895 nullptr); 1896 } 1897 address native_func = method->native_function(); 1898 assert(native_func != nullptr, "must have function"); 1899 1900 // An OopMap for lock (and class if static) 1901 OopMapSet *oop_maps = new OopMapSet(); 1902 intptr_t start = (intptr_t)__ pc(); 1903 1904 // We have received a description of where all the java arg are located 1905 // on entry to the wrapper. We need to convert these args to where 1906 // the jni function will expect them. To figure out where they go 1907 // we convert the java signature to a C signature by inserting 1908 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1909 1910 const int total_in_args = method->size_of_parameters(); 1911 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1912 1913 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1914 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1915 BasicType* in_elem_bt = nullptr; 1916 1917 int argc = 0; 1918 out_sig_bt[argc++] = T_ADDRESS; 1919 if (method->is_static()) { 1920 out_sig_bt[argc++] = T_OBJECT; 1921 } 1922 1923 for (int i = 0; i < total_in_args ; i++ ) { 1924 out_sig_bt[argc++] = in_sig_bt[i]; 1925 } 1926 1927 // Now figure out where the args must be stored and how much stack space 1928 // they require. 1929 int out_arg_slots; 1930 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1931 1932 // Compute framesize for the wrapper. We need to handlize all oops in 1933 // incoming registers 1934 1935 // Calculate the total number of stack slots we will need. 1936 1937 // First count the abi requirement plus all of the outgoing args 1938 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1939 1940 // Now the space for the inbound oop handle area 1941 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1942 1943 int oop_handle_offset = stack_slots; 1944 stack_slots += total_save_slots; 1945 1946 // Now any space we need for handlizing a klass if static method 1947 1948 int klass_slot_offset = 0; 1949 int klass_offset = -1; 1950 int lock_slot_offset = 0; 1951 bool is_static = false; 1952 1953 if (method->is_static()) { 1954 klass_slot_offset = stack_slots; 1955 stack_slots += VMRegImpl::slots_per_word; 1956 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1957 is_static = true; 1958 } 1959 1960 // Plus a lock if needed 1961 1962 if (method->is_synchronized()) { 1963 lock_slot_offset = stack_slots; 1964 stack_slots += VMRegImpl::slots_per_word; 1965 } 1966 1967 // Now a place (+2) to save return values or temp during shuffling 1968 // + 4 for return address (which we own) and saved rbp 1969 stack_slots += 6; 1970 1971 // Ok The space we have allocated will look like: 1972 // 1973 // 1974 // FP-> | | 1975 // |---------------------| 1976 // | 2 slots for moves | 1977 // |---------------------| 1978 // | lock box (if sync) | 1979 // |---------------------| <- lock_slot_offset 1980 // | klass (if static) | 1981 // |---------------------| <- klass_slot_offset 1982 // | oopHandle area | 1983 // |---------------------| <- oop_handle_offset (6 java arg registers) 1984 // | outbound memory | 1985 // | based arguments | 1986 // | | 1987 // |---------------------| 1988 // | | 1989 // SP-> | out_preserved_slots | 1990 // 1991 // 1992 1993 1994 // Now compute actual number of stack words we need rounding to make 1995 // stack properly aligned. 1996 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1997 1998 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1999 2000 // First thing make an ic check to see if we should even be here 2001 2002 // We are free to use all registers as temps without saving them and 2003 // restoring them except rbp. rbp is the only callee save register 2004 // as far as the interpreter and the compiler(s) are concerned. 2005 2006 const Register receiver = j_rarg0; 2007 2008 Label exception_pending; 2009 2010 assert_different_registers(receiver, rscratch1, rscratch2); 2011 __ verify_oop(receiver); 2012 __ ic_check(8 /* end_alignment */); 2013 2014 int vep_offset = ((intptr_t)__ pc()) - start; 2015 2016 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2017 Label L_skip_barrier; 2018 Register klass = r10; 2019 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2020 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2021 2022 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2023 2024 __ bind(L_skip_barrier); 2025 } 2026 2027 #ifdef COMPILER1 2028 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 2029 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 2030 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 2031 } 2032 #endif // COMPILER1 2033 2034 // The instruction at the verified entry point must be 5 bytes or longer 2035 // because it can be patched on the fly by make_non_entrant. The stack bang 2036 // instruction fits that requirement. 2037 2038 // Generate stack overflow check 2039 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 2040 2041 // Generate a new frame for the wrapper. 2042 __ enter(); 2043 // -2 because return address is already present and so is saved rbp 2044 __ subptr(rsp, stack_size - 2*wordSize); 2045 2046 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2047 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 2048 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 2049 2050 // Frame is now completed as far as size and linkage. 2051 int frame_complete = ((intptr_t)__ pc()) - start; 2052 2053 #ifdef ASSERT 2054 __ check_stack_alignment(rsp, "improperly aligned stack"); 2055 #endif /* ASSERT */ 2056 2057 2058 // We use r14 as the oop handle for the receiver/klass 2059 // It is callee save so it survives the call to native 2060 2061 const Register oop_handle_reg = r14; 2062 2063 // 2064 // We immediately shuffle the arguments so that any vm call we have to 2065 // make from here on out (sync slow path, jvmti, etc.) we will have 2066 // captured the oops from our caller and have a valid oopMap for 2067 // them. 2068 2069 // ----------------- 2070 // The Grand Shuffle 2071 2072 // The Java calling convention is either equal (linux) or denser (win64) than the 2073 // c calling convention. However the because of the jni_env argument the c calling 2074 // convention always has at least one more (and two for static) arguments than Java. 2075 // Therefore if we move the args from java -> c backwards then we will never have 2076 // a register->register conflict and we don't have to build a dependency graph 2077 // and figure out how to break any cycles. 2078 // 2079 2080 // Record esp-based slot for receiver on stack for non-static methods 2081 int receiver_offset = -1; 2082 2083 // This is a trick. We double the stack slots so we can claim 2084 // the oops in the caller's frame. Since we are sure to have 2085 // more args than the caller doubling is enough to make 2086 // sure we can capture all the incoming oop args from the 2087 // caller. 2088 // 2089 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2090 2091 // Mark location of rbp (someday) 2092 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2093 2094 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2095 // All inbound args are referenced based on rbp and all outbound args via rsp. 2096 2097 2098 #ifdef ASSERT 2099 bool reg_destroyed[Register::number_of_registers]; 2100 bool freg_destroyed[XMMRegister::number_of_registers]; 2101 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2102 reg_destroyed[r] = false; 2103 } 2104 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2105 freg_destroyed[f] = false; 2106 } 2107 2108 #endif /* ASSERT */ 2109 2110 // For JNI natives the incoming and outgoing registers are offset upwards. 2111 GrowableArray<int> arg_order(2 * total_in_args); 2112 2113 VMRegPair tmp_vmreg; 2114 tmp_vmreg.set2(rbx->as_VMReg()); 2115 2116 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2117 arg_order.push(i); 2118 arg_order.push(c_arg); 2119 } 2120 2121 int temploc = -1; 2122 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2123 int i = arg_order.at(ai); 2124 int c_arg = arg_order.at(ai + 1); 2125 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2126 #ifdef ASSERT 2127 if (in_regs[i].first()->is_Register()) { 2128 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2129 } else if (in_regs[i].first()->is_XMMRegister()) { 2130 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2131 } 2132 if (out_regs[c_arg].first()->is_Register()) { 2133 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2134 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2135 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2136 } 2137 #endif /* ASSERT */ 2138 switch (in_sig_bt[i]) { 2139 case T_ARRAY: 2140 case T_OBJECT: 2141 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2142 ((i == 0) && (!is_static)), 2143 &receiver_offset); 2144 break; 2145 case T_VOID: 2146 break; 2147 2148 case T_FLOAT: 2149 __ float_move(in_regs[i], out_regs[c_arg]); 2150 break; 2151 2152 case T_DOUBLE: 2153 assert( i + 1 < total_in_args && 2154 in_sig_bt[i + 1] == T_VOID && 2155 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2156 __ double_move(in_regs[i], out_regs[c_arg]); 2157 break; 2158 2159 case T_LONG : 2160 __ long_move(in_regs[i], out_regs[c_arg]); 2161 break; 2162 2163 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2164 2165 default: 2166 __ move32_64(in_regs[i], out_regs[c_arg]); 2167 } 2168 } 2169 2170 int c_arg; 2171 2172 // Pre-load a static method's oop into r14. Used both by locking code and 2173 // the normal JNI call code. 2174 // point c_arg at the first arg that is already loaded in case we 2175 // need to spill before we call out 2176 c_arg = total_c_args - total_in_args; 2177 2178 if (method->is_static()) { 2179 2180 // load oop into a register 2181 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2182 2183 // Now handlize the static class mirror it's known not-null. 2184 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2185 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2186 2187 // Now get the handle 2188 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2189 // store the klass handle as second argument 2190 __ movptr(c_rarg1, oop_handle_reg); 2191 // and protect the arg if we must spill 2192 c_arg--; 2193 } 2194 2195 // Change state to native (we save the return address in the thread, since it might not 2196 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2197 // points into the right code segment. It does not have to be the correct return pc. 2198 // We use the same pc/oopMap repeatedly when we call out 2199 2200 intptr_t the_pc = (intptr_t) __ pc(); 2201 oop_maps->add_gc_map(the_pc - start, map); 2202 2203 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1); 2204 2205 2206 // We have all of the arguments setup at this point. We must not touch any register 2207 // argument registers at this point (what if we save/restore them there are no oop? 2208 2209 if (DTraceMethodProbes) { 2210 // protect the args we've loaded 2211 save_args(masm, total_c_args, c_arg, out_regs); 2212 __ mov_metadata(c_rarg1, method()); 2213 __ call_VM_leaf( 2214 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2215 r15_thread, c_rarg1); 2216 restore_args(masm, total_c_args, c_arg, out_regs); 2217 } 2218 2219 // RedefineClasses() tracing support for obsolete method entry 2220 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2221 // protect the args we've loaded 2222 save_args(masm, total_c_args, c_arg, out_regs); 2223 __ mov_metadata(c_rarg1, method()); 2224 __ call_VM_leaf( 2225 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2226 r15_thread, c_rarg1); 2227 restore_args(masm, total_c_args, c_arg, out_regs); 2228 } 2229 2230 // Lock a synchronized method 2231 2232 // Register definitions used by locking and unlocking 2233 2234 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2235 const Register obj_reg = rbx; // Will contain the oop 2236 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2237 const Register old_hdr = r13; // value of old header at unlock time 2238 2239 Label slow_path_lock; 2240 Label lock_done; 2241 2242 if (method->is_synchronized()) { 2243 Label count_mon; 2244 2245 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2246 2247 // Get the handle (the 2nd argument) 2248 __ mov(oop_handle_reg, c_rarg1); 2249 2250 // Get address of the box 2251 2252 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2253 2254 // Load the oop from the handle 2255 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2256 2257 if (LockingMode == LM_MONITOR) { 2258 __ jmp(slow_path_lock); 2259 } else if (LockingMode == LM_LEGACY) { 2260 // Load immediate 1 into swap_reg %rax 2261 __ movl(swap_reg, 1); 2262 2263 // Load (object->mark() | 1) into swap_reg %rax 2264 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2265 2266 // Save (object->mark() | 1) into BasicLock's displaced header 2267 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2268 2269 // src -> dest iff dest == rax else rax <- dest 2270 __ lock(); 2271 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2272 __ jcc(Assembler::equal, count_mon); 2273 2274 // Hmm should this move to the slow path code area??? 2275 2276 // Test if the oopMark is an obvious stack pointer, i.e., 2277 // 1) (mark & 3) == 0, and 2278 // 2) rsp <= mark < mark + os::pagesize() 2279 // These 3 tests can be done by evaluating the following 2280 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2281 // assuming both stack pointer and pagesize have their 2282 // least significant 2 bits clear. 2283 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2284 2285 __ subptr(swap_reg, rsp); 2286 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2287 2288 // Save the test result, for recursive case, the result is zero 2289 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2290 __ jcc(Assembler::notEqual, slow_path_lock); 2291 } else { 2292 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2293 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2294 } 2295 __ jmp (lock_done); 2296 2297 __ bind(count_mon); 2298 __ inc_held_monitor_count(); 2299 2300 // Slow path will re-enter here 2301 __ bind(lock_done); 2302 } 2303 2304 // Finally just about ready to make the JNI call 2305 2306 // get JNIEnv* which is first argument to native 2307 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2308 2309 // Now set thread in native 2310 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2311 2312 __ call(RuntimeAddress(native_func)); 2313 2314 // Verify or restore cpu control state after JNI call 2315 __ restore_cpu_control_state_after_jni(rscratch1); 2316 2317 // Unpack native results. 2318 switch (ret_type) { 2319 case T_BOOLEAN: __ c2bool(rax); break; 2320 case T_CHAR : __ movzwl(rax, rax); break; 2321 case T_BYTE : __ sign_extend_byte (rax); break; 2322 case T_SHORT : __ sign_extend_short(rax); break; 2323 case T_INT : /* nothing to do */ break; 2324 case T_DOUBLE : 2325 case T_FLOAT : 2326 // Result is in xmm0 we'll save as needed 2327 break; 2328 case T_ARRAY: // Really a handle 2329 case T_OBJECT: // Really a handle 2330 break; // can't de-handlize until after safepoint check 2331 case T_VOID: break; 2332 case T_LONG: break; 2333 default : ShouldNotReachHere(); 2334 } 2335 2336 Label after_transition; 2337 2338 // Switch thread to "native transition" state before reading the synchronization state. 2339 // This additional state is necessary because reading and testing the synchronization 2340 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2341 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2342 // VM thread changes sync state to synchronizing and suspends threads for GC. 2343 // Thread A is resumed to finish this native method, but doesn't block here since it 2344 // didn't see any synchronization is progress, and escapes. 2345 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2346 2347 // Force this write out before the read below 2348 if (!UseSystemMemoryBarrier) { 2349 __ membar(Assembler::Membar_mask_bits( 2350 Assembler::LoadLoad | Assembler::LoadStore | 2351 Assembler::StoreLoad | Assembler::StoreStore)); 2352 } 2353 2354 // check for safepoint operation in progress and/or pending suspend requests 2355 { 2356 Label Continue; 2357 Label slow_path; 2358 2359 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2360 2361 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2362 __ jcc(Assembler::equal, Continue); 2363 __ bind(slow_path); 2364 2365 // Don't use call_VM as it will see a possible pending exception and forward it 2366 // and never return here preventing us from clearing _last_native_pc down below. 2367 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2368 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2369 // by hand. 2370 // 2371 __ vzeroupper(); 2372 save_native_result(masm, ret_type, stack_slots); 2373 __ mov(c_rarg0, r15_thread); 2374 __ mov(r12, rsp); // remember sp 2375 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2376 __ andptr(rsp, -16); // align stack as required by ABI 2377 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2378 __ mov(rsp, r12); // restore sp 2379 __ reinit_heapbase(); 2380 // Restore any method result value 2381 restore_native_result(masm, ret_type, stack_slots); 2382 __ bind(Continue); 2383 } 2384 2385 // change thread state 2386 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2387 __ bind(after_transition); 2388 2389 // Check preemption for Object.wait() 2390 if (method->is_object_wait0()) { 2391 Label not_preempted; 2392 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2393 __ cmpptr(rscratch1, NULL_WORD); 2394 __ jccb(Assembler::equal, not_preempted); 2395 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2396 __ jmp(rscratch1); 2397 __ bind(not_preempted); 2398 } 2399 int resume_wait_offset = ((intptr_t)__ pc()) - start; 2400 2401 Label reguard; 2402 Label reguard_done; 2403 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2404 __ jcc(Assembler::equal, reguard); 2405 __ bind(reguard_done); 2406 2407 // native result if any is live 2408 2409 // Unlock 2410 Label slow_path_unlock; 2411 Label unlock_done; 2412 if (method->is_synchronized()) { 2413 2414 Label fast_done; 2415 2416 // Get locked oop from the handle we passed to jni 2417 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2418 2419 if (LockingMode == LM_LEGACY) { 2420 Label not_recur; 2421 // Simple recursive lock? 2422 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2423 __ jcc(Assembler::notEqual, not_recur); 2424 __ jmpb(fast_done); 2425 __ bind(not_recur); 2426 } 2427 2428 // Must save rax if it is live now because cmpxchg must use it 2429 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2430 save_native_result(masm, ret_type, stack_slots); 2431 } 2432 2433 if (LockingMode == LM_MONITOR) { 2434 __ jmp(slow_path_unlock); 2435 } else if (LockingMode == LM_LEGACY) { 2436 // get address of the stack lock 2437 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2438 // get old displaced header 2439 __ movptr(old_hdr, Address(rax, 0)); 2440 2441 // Atomic swap old header if oop still contains the stack lock 2442 __ lock(); 2443 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2444 __ jcc(Assembler::notEqual, slow_path_unlock); 2445 __ dec_held_monitor_count(); 2446 } else { 2447 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2448 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2449 } 2450 2451 // slow path re-enters here 2452 __ bind(unlock_done); 2453 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2454 restore_native_result(masm, ret_type, stack_slots); 2455 } 2456 2457 __ bind(fast_done); 2458 } 2459 if (DTraceMethodProbes) { 2460 save_native_result(masm, ret_type, stack_slots); 2461 __ mov_metadata(c_rarg1, method()); 2462 __ call_VM_leaf( 2463 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2464 r15_thread, c_rarg1); 2465 restore_native_result(masm, ret_type, stack_slots); 2466 } 2467 2468 __ reset_last_Java_frame(false); 2469 2470 // Unbox oop result, e.g. JNIHandles::resolve value. 2471 if (is_reference_type(ret_type)) { 2472 __ resolve_jobject(rax /* value */, 2473 r15_thread /* thread */, 2474 rcx /* tmp */); 2475 } 2476 2477 if (CheckJNICalls) { 2478 // clear_pending_jni_exception_check 2479 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2480 } 2481 2482 // reset handle block 2483 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2484 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2485 2486 // pop our frame 2487 2488 __ leave(); 2489 2490 // Any exception pending? 2491 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2492 __ jcc(Assembler::notEqual, exception_pending); 2493 2494 // Return 2495 2496 __ ret(0); 2497 2498 // Unexpected paths are out of line and go here 2499 2500 // forward the exception 2501 __ bind(exception_pending); 2502 2503 // and forward the exception 2504 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2505 2506 // Slow path locking & unlocking 2507 if (method->is_synchronized()) { 2508 2509 // BEGIN Slow path lock 2510 __ bind(slow_path_lock); 2511 2512 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2513 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2514 2515 // protect the args we've loaded 2516 save_args(masm, total_c_args, c_arg, out_regs); 2517 2518 __ mov(c_rarg0, obj_reg); 2519 __ mov(c_rarg1, lock_reg); 2520 __ mov(c_rarg2, r15_thread); 2521 2522 // Not a leaf but we have last_Java_frame setup as we want 2523 // Force freeze slow path on ObjectMonitor::enter() for now which will fail with freeze_pinned_native. 2524 __ push_cont_fastpath(); 2525 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2526 __ pop_cont_fastpath(); 2527 restore_args(masm, total_c_args, c_arg, out_regs); 2528 2529 #ifdef ASSERT 2530 { Label L; 2531 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2532 __ jcc(Assembler::equal, L); 2533 __ stop("no pending exception allowed on exit from monitorenter"); 2534 __ bind(L); 2535 } 2536 #endif 2537 __ jmp(lock_done); 2538 2539 // END Slow path lock 2540 2541 // BEGIN Slow path unlock 2542 __ bind(slow_path_unlock); 2543 2544 // If we haven't already saved the native result we must save it now as xmm registers 2545 // are still exposed. 2546 __ vzeroupper(); 2547 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2548 save_native_result(masm, ret_type, stack_slots); 2549 } 2550 2551 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2552 2553 __ mov(c_rarg0, obj_reg); 2554 __ mov(c_rarg2, r15_thread); 2555 __ mov(r12, rsp); // remember sp 2556 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2557 __ andptr(rsp, -16); // align stack as required by ABI 2558 2559 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2560 // NOTE that obj_reg == rbx currently 2561 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2562 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2563 2564 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2565 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2566 __ mov(rsp, r12); // restore sp 2567 __ reinit_heapbase(); 2568 #ifdef ASSERT 2569 { 2570 Label L; 2571 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2572 __ jcc(Assembler::equal, L); 2573 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2574 __ bind(L); 2575 } 2576 #endif /* ASSERT */ 2577 2578 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2579 2580 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2581 restore_native_result(masm, ret_type, stack_slots); 2582 } 2583 __ jmp(unlock_done); 2584 2585 // END Slow path unlock 2586 2587 } // synchronized 2588 2589 // SLOW PATH Reguard the stack if needed 2590 2591 __ bind(reguard); 2592 __ vzeroupper(); 2593 save_native_result(masm, ret_type, stack_slots); 2594 __ mov(r12, rsp); // remember sp 2595 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2596 __ andptr(rsp, -16); // align stack as required by ABI 2597 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2598 __ mov(rsp, r12); // restore sp 2599 __ reinit_heapbase(); 2600 restore_native_result(masm, ret_type, stack_slots); 2601 // and continue 2602 __ jmp(reguard_done); 2603 2604 2605 2606 __ flush(); 2607 2608 nmethod *nm = nmethod::new_native_nmethod(method, 2609 compile_id, 2610 masm->code(), 2611 vep_offset, 2612 frame_complete, 2613 stack_slots / VMRegImpl::slots_per_word, 2614 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2615 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2616 oop_maps); 2617 2618 if (nm != nullptr && method->is_object_wait0()) { 2619 SharedRuntime::set_native_frame_resume_entry(nm->code_begin() + resume_wait_offset); 2620 } 2621 2622 return nm; 2623 } 2624 2625 // this function returns the adjust size (in number of words) to a c2i adapter 2626 // activation for use during deoptimization 2627 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2628 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2629 } 2630 2631 2632 uint SharedRuntime::out_preserve_stack_slots() { 2633 return 0; 2634 } 2635 2636 2637 // Number of stack slots between incoming argument block and the start of 2638 // a new frame. The PROLOG must add this many slots to the stack. The 2639 // EPILOG must remove this many slots. amd64 needs two slots for 2640 // return address. 2641 uint SharedRuntime::in_preserve_stack_slots() { 2642 return 4 + 2 * VerifyStackAtCalls; 2643 } 2644 2645 VMReg SharedRuntime::thread_register() { 2646 return r15_thread->as_VMReg(); 2647 } 2648 2649 //------------------------------generate_deopt_blob---------------------------- 2650 void SharedRuntime::generate_deopt_blob() { 2651 // Allocate space for the code 2652 ResourceMark rm; 2653 // Setup code generation tools 2654 int pad = 0; 2655 if (UseAVX > 2) { 2656 pad += 1024; 2657 } 2658 if (UseAPX) { 2659 pad += 1024; 2660 } 2661 #if INCLUDE_JVMCI 2662 if (EnableJVMCI) { 2663 pad += 512; // Increase the buffer size when compiling for JVMCI 2664 } 2665 #endif 2666 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2667 MacroAssembler* masm = new MacroAssembler(&buffer); 2668 int frame_size_in_words; 2669 OopMap* map = nullptr; 2670 OopMapSet *oop_maps = new OopMapSet(); 2671 2672 // ------------- 2673 // This code enters when returning to a de-optimized nmethod. A return 2674 // address has been pushed on the stack, and return values are in 2675 // registers. 2676 // If we are doing a normal deopt then we were called from the patched 2677 // nmethod from the point we returned to the nmethod. So the return 2678 // address on the stack is wrong by NativeCall::instruction_size 2679 // We will adjust the value so it looks like we have the original return 2680 // address on the stack (like when we eagerly deoptimized). 2681 // In the case of an exception pending when deoptimizing, we enter 2682 // with a return address on the stack that points after the call we patched 2683 // into the exception handler. We have the following register state from, 2684 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2685 // rax: exception oop 2686 // rbx: exception handler 2687 // rdx: throwing pc 2688 // So in this case we simply jam rdx into the useless return address and 2689 // the stack looks just like we want. 2690 // 2691 // At this point we need to de-opt. We save the argument return 2692 // registers. We call the first C routine, fetch_unroll_info(). This 2693 // routine captures the return values and returns a structure which 2694 // describes the current frame size and the sizes of all replacement frames. 2695 // The current frame is compiled code and may contain many inlined 2696 // functions, each with their own JVM state. We pop the current frame, then 2697 // push all the new frames. Then we call the C routine unpack_frames() to 2698 // populate these frames. Finally unpack_frames() returns us the new target 2699 // address. Notice that callee-save registers are BLOWN here; they have 2700 // already been captured in the vframeArray at the time the return PC was 2701 // patched. 2702 address start = __ pc(); 2703 Label cont; 2704 2705 // Prolog for non exception case! 2706 2707 // Save everything in sight. 2708 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2709 2710 // Normal deoptimization. Save exec mode for unpack_frames. 2711 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2712 __ jmp(cont); 2713 2714 int reexecute_offset = __ pc() - start; 2715 #if INCLUDE_JVMCI && !defined(COMPILER1) 2716 if (EnableJVMCI && UseJVMCICompiler) { 2717 // JVMCI does not use this kind of deoptimization 2718 __ should_not_reach_here(); 2719 } 2720 #endif 2721 2722 // Reexecute case 2723 // return address is the pc describes what bci to do re-execute at 2724 2725 // No need to update map as each call to save_live_registers will produce identical oopmap 2726 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2727 2728 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2729 __ jmp(cont); 2730 2731 #if INCLUDE_JVMCI 2732 Label after_fetch_unroll_info_call; 2733 int implicit_exception_uncommon_trap_offset = 0; 2734 int uncommon_trap_offset = 0; 2735 2736 if (EnableJVMCI) { 2737 implicit_exception_uncommon_trap_offset = __ pc() - start; 2738 2739 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2740 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2741 2742 uncommon_trap_offset = __ pc() - start; 2743 2744 // Save everything in sight. 2745 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2746 // fetch_unroll_info needs to call last_java_frame() 2747 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2748 2749 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2750 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2751 2752 __ movl(r14, Deoptimization::Unpack_reexecute); 2753 __ mov(c_rarg0, r15_thread); 2754 __ movl(c_rarg2, r14); // exec mode 2755 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2756 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2757 2758 __ reset_last_Java_frame(false); 2759 2760 __ jmp(after_fetch_unroll_info_call); 2761 } // EnableJVMCI 2762 #endif // INCLUDE_JVMCI 2763 2764 int exception_offset = __ pc() - start; 2765 2766 // Prolog for exception case 2767 2768 // all registers are dead at this entry point, except for rax, and 2769 // rdx which contain the exception oop and exception pc 2770 // respectively. Set them in TLS and fall thru to the 2771 // unpack_with_exception_in_tls entry point. 2772 2773 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2774 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2775 2776 int exception_in_tls_offset = __ pc() - start; 2777 2778 // new implementation because exception oop is now passed in JavaThread 2779 2780 // Prolog for exception case 2781 // All registers must be preserved because they might be used by LinearScan 2782 // Exceptiop oop and throwing PC are passed in JavaThread 2783 // tos: stack at point of call to method that threw the exception (i.e. only 2784 // args are on the stack, no return address) 2785 2786 // make room on stack for the return address 2787 // It will be patched later with the throwing pc. The correct value is not 2788 // available now because loading it from memory would destroy registers. 2789 __ push(0); 2790 2791 // Save everything in sight. 2792 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2793 2794 // Now it is safe to overwrite any register 2795 2796 // Deopt during an exception. Save exec mode for unpack_frames. 2797 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2798 2799 // load throwing pc from JavaThread and patch it as the return address 2800 // of the current frame. Then clear the field in JavaThread 2801 2802 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2803 __ movptr(Address(rbp, wordSize), rdx); 2804 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2805 2806 #ifdef ASSERT 2807 // verify that there is really an exception oop in JavaThread 2808 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2809 __ verify_oop(rax); 2810 2811 // verify that there is no pending exception 2812 Label no_pending_exception; 2813 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2814 __ testptr(rax, rax); 2815 __ jcc(Assembler::zero, no_pending_exception); 2816 __ stop("must not have pending exception here"); 2817 __ bind(no_pending_exception); 2818 #endif 2819 2820 __ bind(cont); 2821 2822 // Call C code. Need thread and this frame, but NOT official VM entry 2823 // crud. We cannot block on this call, no GC can happen. 2824 // 2825 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2826 2827 // fetch_unroll_info needs to call last_java_frame(). 2828 2829 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2830 #ifdef ASSERT 2831 { Label L; 2832 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2833 __ jcc(Assembler::equal, L); 2834 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2835 __ bind(L); 2836 } 2837 #endif // ASSERT 2838 __ mov(c_rarg0, r15_thread); 2839 __ movl(c_rarg1, r14); // exec_mode 2840 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2841 2842 // Need to have an oopmap that tells fetch_unroll_info where to 2843 // find any register it might need. 2844 oop_maps->add_gc_map(__ pc() - start, map); 2845 2846 __ reset_last_Java_frame(false); 2847 2848 #if INCLUDE_JVMCI 2849 if (EnableJVMCI) { 2850 __ bind(after_fetch_unroll_info_call); 2851 } 2852 #endif 2853 2854 // Load UnrollBlock* into rdi 2855 __ mov(rdi, rax); 2856 2857 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2858 Label noException; 2859 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2860 __ jcc(Assembler::notEqual, noException); 2861 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2862 // QQQ this is useless it was null above 2863 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2864 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2865 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2866 2867 __ verify_oop(rax); 2868 2869 // Overwrite the result registers with the exception results. 2870 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2871 // I think this is useless 2872 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2873 2874 __ bind(noException); 2875 2876 // Only register save data is on the stack. 2877 // Now restore the result registers. Everything else is either dead 2878 // or captured in the vframeArray. 2879 RegisterSaver::restore_result_registers(masm); 2880 2881 // All of the register save area has been popped of the stack. Only the 2882 // return address remains. 2883 2884 // Pop all the frames we must move/replace. 2885 // 2886 // Frame picture (youngest to oldest) 2887 // 1: self-frame (no frame link) 2888 // 2: deopting frame (no frame link) 2889 // 3: caller of deopting frame (could be compiled/interpreted). 2890 // 2891 // Note: by leaving the return address of self-frame on the stack 2892 // and using the size of frame 2 to adjust the stack 2893 // when we are done the return to frame 3 will still be on the stack. 2894 2895 // Pop deoptimized frame 2896 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2897 __ addptr(rsp, rcx); 2898 2899 // rsp should be pointing at the return address to the caller (3) 2900 2901 // Pick up the initial fp we should save 2902 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2903 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2904 2905 #ifdef ASSERT 2906 // Compilers generate code that bang the stack by as much as the 2907 // interpreter would need. So this stack banging should never 2908 // trigger a fault. Verify that it does not on non product builds. 2909 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2910 __ bang_stack_size(rbx, rcx); 2911 #endif 2912 2913 // Load address of array of frame pcs into rcx 2914 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2915 2916 // Trash the old pc 2917 __ addptr(rsp, wordSize); 2918 2919 // Load address of array of frame sizes into rsi 2920 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2921 2922 // Load counter into rdx 2923 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2924 2925 // Now adjust the caller's stack to make up for the extra locals 2926 // but record the original sp so that we can save it in the skeletal interpreter 2927 // frame and the stack walking of interpreter_sender will get the unextended sp 2928 // value and not the "real" sp value. 2929 2930 const Register sender_sp = r8; 2931 2932 __ mov(sender_sp, rsp); 2933 __ movl(rbx, Address(rdi, 2934 Deoptimization::UnrollBlock:: 2935 caller_adjustment_offset())); 2936 __ subptr(rsp, rbx); 2937 2938 // Push interpreter frames in a loop 2939 Label loop; 2940 __ bind(loop); 2941 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2942 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2943 __ pushptr(Address(rcx, 0)); // Save return address 2944 __ enter(); // Save old & set new ebp 2945 __ subptr(rsp, rbx); // Prolog 2946 // This value is corrected by layout_activation_impl 2947 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2948 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2949 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2950 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2951 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2952 __ decrementl(rdx); // Decrement counter 2953 __ jcc(Assembler::notZero, loop); 2954 __ pushptr(Address(rcx, 0)); // Save final return address 2955 2956 // Re-push self-frame 2957 __ enter(); // Save old & set new ebp 2958 2959 // Allocate a full sized register save area. 2960 // Return address and rbp are in place, so we allocate two less words. 2961 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2962 2963 // Restore frame locals after moving the frame 2964 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2965 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2966 2967 // Call C code. Need thread but NOT official VM entry 2968 // crud. We cannot block on this call, no GC can happen. Call should 2969 // restore return values to their stack-slots with the new SP. 2970 // 2971 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2972 2973 // Use rbp because the frames look interpreted now 2974 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2975 // Don't need the precise return PC here, just precise enough to point into this code blob. 2976 address the_pc = __ pc(); 2977 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2978 2979 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2980 __ mov(c_rarg0, r15_thread); 2981 __ movl(c_rarg1, r14); // second arg: exec_mode 2982 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2983 // Revert SP alignment after call since we're going to do some SP relative addressing below 2984 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2985 2986 // Set an oopmap for the call site 2987 // Use the same PC we used for the last java frame 2988 oop_maps->add_gc_map(the_pc - start, 2989 new OopMap( frame_size_in_words, 0 )); 2990 2991 // Clear fp AND pc 2992 __ reset_last_Java_frame(true); 2993 2994 // Collect return values 2995 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2996 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2997 // I think this is useless (throwing pc?) 2998 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2999 3000 // Pop self-frame. 3001 __ leave(); // Epilog 3002 3003 // Jump to interpreter 3004 __ ret(0); 3005 3006 // Make sure all code is generated 3007 masm->flush(); 3008 3009 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 3010 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 3011 #if INCLUDE_JVMCI 3012 if (EnableJVMCI) { 3013 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 3014 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 3015 } 3016 #endif 3017 } 3018 3019 #ifdef COMPILER2 3020 //------------------------------generate_uncommon_trap_blob-------------------- 3021 void SharedRuntime::generate_uncommon_trap_blob() { 3022 // Allocate space for the code 3023 ResourceMark rm; 3024 // Setup code generation tools 3025 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 3026 MacroAssembler* masm = new MacroAssembler(&buffer); 3027 3028 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 3029 3030 address start = __ pc(); 3031 3032 // Push self-frame. We get here with a return address on the 3033 // stack, so rsp is 8-byte aligned until we allocate our frame. 3034 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 3035 3036 // No callee saved registers. rbp is assumed implicitly saved 3037 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 3038 3039 // compiler left unloaded_class_index in j_rarg0 move to where the 3040 // runtime expects it. 3041 __ movl(c_rarg1, j_rarg0); 3042 3043 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3044 3045 // Call C code. Need thread but NOT official VM entry 3046 // crud. We cannot block on this call, no GC can happen. Call should 3047 // capture callee-saved registers as well as return values. 3048 // Thread is in rdi already. 3049 // 3050 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 3051 3052 __ mov(c_rarg0, r15_thread); 3053 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 3054 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 3055 3056 // Set an oopmap for the call site 3057 OopMapSet* oop_maps = new OopMapSet(); 3058 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 3059 3060 // location of rbp is known implicitly by the frame sender code 3061 3062 oop_maps->add_gc_map(__ pc() - start, map); 3063 3064 __ reset_last_Java_frame(false); 3065 3066 // Load UnrollBlock* into rdi 3067 __ mov(rdi, rax); 3068 3069 #ifdef ASSERT 3070 { Label L; 3071 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()), 3072 Deoptimization::Unpack_uncommon_trap); 3073 __ jcc(Assembler::equal, L); 3074 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap"); 3075 __ bind(L); 3076 } 3077 #endif 3078 3079 // Pop all the frames we must move/replace. 3080 // 3081 // Frame picture (youngest to oldest) 3082 // 1: self-frame (no frame link) 3083 // 2: deopting frame (no frame link) 3084 // 3: caller of deopting frame (could be compiled/interpreted). 3085 3086 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 3087 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 3088 3089 // Pop deoptimized frame (int) 3090 __ movl(rcx, Address(rdi, 3091 Deoptimization::UnrollBlock:: 3092 size_of_deoptimized_frame_offset())); 3093 __ addptr(rsp, rcx); 3094 3095 // rsp should be pointing at the return address to the caller (3) 3096 3097 // Pick up the initial fp we should save 3098 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 3099 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 3100 3101 #ifdef ASSERT 3102 // Compilers generate code that bang the stack by as much as the 3103 // interpreter would need. So this stack banging should never 3104 // trigger a fault. Verify that it does not on non product builds. 3105 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset())); 3106 __ bang_stack_size(rbx, rcx); 3107 #endif 3108 3109 // Load address of array of frame pcs into rcx (address*) 3110 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 3111 3112 // Trash the return pc 3113 __ addptr(rsp, wordSize); 3114 3115 // Load address of array of frame sizes into rsi (intptr_t*) 3116 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset())); 3117 3118 // Counter 3119 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset())); // (int) 3120 3121 // Now adjust the caller's stack to make up for the extra locals but 3122 // record the original sp so that we can save it in the skeletal 3123 // interpreter frame and the stack walking of interpreter_sender 3124 // will get the unextended sp value and not the "real" sp value. 3125 3126 const Register sender_sp = r8; 3127 3128 __ mov(sender_sp, rsp); 3129 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset())); // (int) 3130 __ subptr(rsp, rbx); 3131 3132 // Push interpreter frames in a loop 3133 Label loop; 3134 __ bind(loop); 3135 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3136 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 3137 __ pushptr(Address(rcx, 0)); // Save return address 3138 __ enter(); // Save old & set new rbp 3139 __ subptr(rsp, rbx); // Prolog 3140 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 3141 sender_sp); // Make it walkable 3142 // This value is corrected by layout_activation_impl 3143 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3144 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3145 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3146 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3147 __ decrementl(rdx); // Decrement counter 3148 __ jcc(Assembler::notZero, loop); 3149 __ pushptr(Address(rcx, 0)); // Save final return address 3150 3151 // Re-push self-frame 3152 __ enter(); // Save old & set new rbp 3153 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 3154 // Prolog 3155 3156 // Use rbp because the frames look interpreted now 3157 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3158 // Don't need the precise return PC here, just precise enough to point into this code blob. 3159 address the_pc = __ pc(); 3160 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3161 3162 // Call C code. Need thread but NOT official VM entry 3163 // crud. We cannot block on this call, no GC can happen. Call should 3164 // restore return values to their stack-slots with the new SP. 3165 // Thread is in rdi already. 3166 // 3167 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 3168 3169 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 3170 __ mov(c_rarg0, r15_thread); 3171 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 3172 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3173 3174 // Set an oopmap for the call site 3175 // Use the same PC we used for the last java frame 3176 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3177 3178 // Clear fp AND pc 3179 __ reset_last_Java_frame(true); 3180 3181 // Pop self-frame. 3182 __ leave(); // Epilog 3183 3184 // Jump to interpreter 3185 __ ret(0); 3186 3187 // Make sure all code is generated 3188 masm->flush(); 3189 3190 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 3191 SimpleRuntimeFrame::framesize >> 1); 3192 } 3193 #endif // COMPILER2 3194 3195 //------------------------------generate_handler_blob------ 3196 // 3197 // Generate a special Compile2Runtime blob that saves all registers, 3198 // and setup oopmap. 3199 // 3200 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3201 assert(StubRoutines::forward_exception_entry() != nullptr, 3202 "must be generated before"); 3203 3204 ResourceMark rm; 3205 OopMapSet *oop_maps = new OopMapSet(); 3206 OopMap* map; 3207 3208 // Allocate space for the code. Setup code generation tools. 3209 CodeBuffer buffer("handler_blob", 2348, 1024); 3210 MacroAssembler* masm = new MacroAssembler(&buffer); 3211 3212 address start = __ pc(); 3213 address call_pc = nullptr; 3214 int frame_size_in_words; 3215 bool cause_return = (poll_type == POLL_AT_RETURN); 3216 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3217 3218 // Make room for return address (or push it again) 3219 if (!cause_return) { 3220 __ push(rbx); 3221 } 3222 3223 // Save registers, fpu state, and flags 3224 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3225 3226 // The following is basically a call_VM. However, we need the precise 3227 // address of the call in order to generate an oopmap. Hence, we do all the 3228 // work ourselves. 3229 3230 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 3231 3232 // The return address must always be correct so that frame constructor never 3233 // sees an invalid pc. 3234 3235 if (!cause_return) { 3236 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3237 // Additionally, rbx is a callee saved register and we can look at it later to determine 3238 // if someone changed the return address for us! 3239 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3240 __ movptr(Address(rbp, wordSize), rbx); 3241 } 3242 3243 // Do the call 3244 __ mov(c_rarg0, r15_thread); 3245 __ call(RuntimeAddress(call_ptr)); 3246 3247 // Set an oopmap for the call site. This oopmap will map all 3248 // oop-registers and debug-info registers as callee-saved. This 3249 // will allow deoptimization at this safepoint to find all possible 3250 // debug-info recordings, as well as let GC find all oops. 3251 3252 oop_maps->add_gc_map( __ pc() - start, map); 3253 3254 Label noException; 3255 3256 __ reset_last_Java_frame(false); 3257 3258 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3259 __ jcc(Assembler::equal, noException); 3260 3261 // Exception pending 3262 3263 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3264 3265 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3266 3267 // No exception case 3268 __ bind(noException); 3269 3270 Label no_adjust; 3271 #ifdef ASSERT 3272 Label bail; 3273 #endif 3274 if (!cause_return) { 3275 Label no_prefix, not_special; 3276 3277 // If our stashed return pc was modified by the runtime we avoid touching it 3278 __ cmpptr(rbx, Address(rbp, wordSize)); 3279 __ jccb(Assembler::notEqual, no_adjust); 3280 3281 // Skip over the poll instruction. 3282 // See NativeInstruction::is_safepoint_poll() 3283 // Possible encodings: 3284 // 85 00 test %eax,(%rax) 3285 // 85 01 test %eax,(%rcx) 3286 // 85 02 test %eax,(%rdx) 3287 // 85 03 test %eax,(%rbx) 3288 // 85 06 test %eax,(%rsi) 3289 // 85 07 test %eax,(%rdi) 3290 // 3291 // 41 85 00 test %eax,(%r8) 3292 // 41 85 01 test %eax,(%r9) 3293 // 41 85 02 test %eax,(%r10) 3294 // 41 85 03 test %eax,(%r11) 3295 // 41 85 06 test %eax,(%r14) 3296 // 41 85 07 test %eax,(%r15) 3297 // 3298 // 85 04 24 test %eax,(%rsp) 3299 // 41 85 04 24 test %eax,(%r12) 3300 // 85 45 00 test %eax,0x0(%rbp) 3301 // 41 85 45 00 test %eax,0x0(%r13) 3302 3303 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3304 __ jcc(Assembler::notEqual, no_prefix); 3305 __ addptr(rbx, 1); 3306 __ bind(no_prefix); 3307 #ifdef ASSERT 3308 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3309 #endif 3310 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3311 // r12/rsp 0x04 3312 // r13/rbp 0x05 3313 __ movzbq(rcx, Address(rbx, 1)); 3314 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3315 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3316 __ cmpptr(rcx, 1); 3317 __ jcc(Assembler::above, not_special); 3318 __ addptr(rbx, 1); 3319 __ bind(not_special); 3320 #ifdef ASSERT 3321 // Verify the correct encoding of the poll we're about to skip. 3322 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3323 __ jcc(Assembler::notEqual, bail); 3324 // Mask out the modrm bits 3325 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3326 // rax encodes to 0, so if the bits are nonzero it's incorrect 3327 __ jcc(Assembler::notZero, bail); 3328 #endif 3329 // Adjust return pc forward to step over the safepoint poll instruction 3330 __ addptr(rbx, 2); 3331 __ movptr(Address(rbp, wordSize), rbx); 3332 } 3333 3334 __ bind(no_adjust); 3335 // Normal exit, restore registers and exit. 3336 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3337 __ ret(0); 3338 3339 #ifdef ASSERT 3340 __ bind(bail); 3341 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3342 #endif 3343 3344 // Make sure all code is generated 3345 masm->flush(); 3346 3347 // Fill-out other meta info 3348 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3349 } 3350 3351 // 3352 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3353 // 3354 // Generate a stub that calls into vm to find out the proper destination 3355 // of a java call. All the argument registers are live at this point 3356 // but since this is generic code we don't know what they are and the caller 3357 // must do any gc of the args. 3358 // 3359 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) { 3360 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before"); 3361 3362 // allocate space for the code 3363 ResourceMark rm; 3364 3365 CodeBuffer buffer(name, 1552, 512); 3366 MacroAssembler* masm = new MacroAssembler(&buffer); 3367 3368 int frame_size_in_words; 3369 3370 OopMapSet *oop_maps = new OopMapSet(); 3371 OopMap* map = nullptr; 3372 3373 int start = __ offset(); 3374 3375 // No need to save vector registers since they are caller-saved anyway. 3376 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false); 3377 3378 int frame_complete = __ offset(); 3379 3380 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3381 3382 __ mov(c_rarg0, r15_thread); 3383 3384 __ call(RuntimeAddress(destination)); 3385 3386 3387 // Set an oopmap for the call site. 3388 // We need this not only for callee-saved registers, but also for volatile 3389 // registers that the compiler might be keeping live across a safepoint. 3390 3391 oop_maps->add_gc_map( __ offset() - start, map); 3392 3393 // rax contains the address we are going to jump to assuming no exception got installed 3394 3395 // clear last_Java_sp 3396 __ reset_last_Java_frame(false); 3397 // check for pending exceptions 3398 Label pending; 3399 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3400 __ jcc(Assembler::notEqual, pending); 3401 3402 // get the returned Method* 3403 __ get_vm_result_2(rbx, r15_thread); 3404 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx); 3405 3406 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3407 3408 RegisterSaver::restore_live_registers(masm); 3409 3410 // We are back to the original state on entry and ready to go. 3411 3412 __ jmp(rax); 3413 3414 // Pending exception after the safepoint 3415 3416 __ bind(pending); 3417 3418 RegisterSaver::restore_live_registers(masm); 3419 3420 // exception pending => remove activation and forward to exception handler 3421 3422 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 3423 3424 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3425 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3426 3427 // ------------- 3428 // make sure all code is generated 3429 masm->flush(); 3430 3431 // return the blob 3432 // frame_size_words or bytes?? 3433 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3434 } 3435 3436 //------------------------------Montgomery multiplication------------------------ 3437 // 3438 3439 #ifndef _WINDOWS 3440 3441 // Subtract 0:b from carry:a. Return carry. 3442 static julong 3443 sub(julong a[], julong b[], julong carry, long len) { 3444 long long i = 0, cnt = len; 3445 julong tmp; 3446 asm volatile("clc; " 3447 "0: ; " 3448 "mov (%[b], %[i], 8), %[tmp]; " 3449 "sbb %[tmp], (%[a], %[i], 8); " 3450 "inc %[i]; dec %[cnt]; " 3451 "jne 0b; " 3452 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3453 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3454 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3455 : "memory"); 3456 return tmp; 3457 } 3458 3459 // Multiply (unsigned) Long A by Long B, accumulating the double- 3460 // length result into the accumulator formed of T0, T1, and T2. 3461 #define MACC(A, B, T0, T1, T2) \ 3462 do { \ 3463 unsigned long hi, lo; \ 3464 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3465 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3466 : "r"(A), "a"(B) : "cc"); \ 3467 } while(0) 3468 3469 // As above, but add twice the double-length result into the 3470 // accumulator. 3471 #define MACC2(A, B, T0, T1, T2) \ 3472 do { \ 3473 unsigned long hi, lo; \ 3474 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3475 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3476 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3477 : "r"(A), "a"(B) : "cc"); \ 3478 } while(0) 3479 3480 #else //_WINDOWS 3481 3482 static julong 3483 sub(julong a[], julong b[], julong carry, long len) { 3484 long i; 3485 julong tmp; 3486 unsigned char c = 1; 3487 for (i = 0; i < len; i++) { 3488 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3489 a[i] = tmp; 3490 } 3491 c = _addcarry_u64(c, carry, ~0, &tmp); 3492 return tmp; 3493 } 3494 3495 // Multiply (unsigned) Long A by Long B, accumulating the double- 3496 // length result into the accumulator formed of T0, T1, and T2. 3497 #define MACC(A, B, T0, T1, T2) \ 3498 do { \ 3499 julong hi, lo; \ 3500 lo = _umul128(A, B, &hi); \ 3501 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3502 c = _addcarry_u64(c, hi, T1, &T1); \ 3503 _addcarry_u64(c, T2, 0, &T2); \ 3504 } while(0) 3505 3506 // As above, but add twice the double-length result into the 3507 // accumulator. 3508 #define MACC2(A, B, T0, T1, T2) \ 3509 do { \ 3510 julong hi, lo; \ 3511 lo = _umul128(A, B, &hi); \ 3512 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3513 c = _addcarry_u64(c, hi, T1, &T1); \ 3514 _addcarry_u64(c, T2, 0, &T2); \ 3515 c = _addcarry_u64(0, lo, T0, &T0); \ 3516 c = _addcarry_u64(c, hi, T1, &T1); \ 3517 _addcarry_u64(c, T2, 0, &T2); \ 3518 } while(0) 3519 3520 #endif //_WINDOWS 3521 3522 // Fast Montgomery multiplication. The derivation of the algorithm is 3523 // in A Cryptographic Library for the Motorola DSP56000, 3524 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3525 3526 static void NOINLINE 3527 montgomery_multiply(julong a[], julong b[], julong n[], 3528 julong m[], julong inv, int len) { 3529 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3530 int i; 3531 3532 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3533 3534 for (i = 0; i < len; i++) { 3535 int j; 3536 for (j = 0; j < i; j++) { 3537 MACC(a[j], b[i-j], t0, t1, t2); 3538 MACC(m[j], n[i-j], t0, t1, t2); 3539 } 3540 MACC(a[i], b[0], t0, t1, t2); 3541 m[i] = t0 * inv; 3542 MACC(m[i], n[0], t0, t1, t2); 3543 3544 assert(t0 == 0, "broken Montgomery multiply"); 3545 3546 t0 = t1; t1 = t2; t2 = 0; 3547 } 3548 3549 for (i = len; i < 2*len; i++) { 3550 int j; 3551 for (j = i-len+1; j < len; j++) { 3552 MACC(a[j], b[i-j], t0, t1, t2); 3553 MACC(m[j], n[i-j], t0, t1, t2); 3554 } 3555 m[i-len] = t0; 3556 t0 = t1; t1 = t2; t2 = 0; 3557 } 3558 3559 while (t0) 3560 t0 = sub(m, n, t0, len); 3561 } 3562 3563 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3564 // multiplies so it should be up to 25% faster than Montgomery 3565 // multiplication. However, its loop control is more complex and it 3566 // may actually run slower on some machines. 3567 3568 static void NOINLINE 3569 montgomery_square(julong a[], julong n[], 3570 julong m[], julong inv, int len) { 3571 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3572 int i; 3573 3574 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3575 3576 for (i = 0; i < len; i++) { 3577 int j; 3578 int end = (i+1)/2; 3579 for (j = 0; j < end; j++) { 3580 MACC2(a[j], a[i-j], t0, t1, t2); 3581 MACC(m[j], n[i-j], t0, t1, t2); 3582 } 3583 if ((i & 1) == 0) { 3584 MACC(a[j], a[j], t0, t1, t2); 3585 } 3586 for (; j < i; j++) { 3587 MACC(m[j], n[i-j], t0, t1, t2); 3588 } 3589 m[i] = t0 * inv; 3590 MACC(m[i], n[0], t0, t1, t2); 3591 3592 assert(t0 == 0, "broken Montgomery square"); 3593 3594 t0 = t1; t1 = t2; t2 = 0; 3595 } 3596 3597 for (i = len; i < 2*len; i++) { 3598 int start = i-len+1; 3599 int end = start + (len - start)/2; 3600 int j; 3601 for (j = start; j < end; j++) { 3602 MACC2(a[j], a[i-j], t0, t1, t2); 3603 MACC(m[j], n[i-j], t0, t1, t2); 3604 } 3605 if ((i & 1) == 0) { 3606 MACC(a[j], a[j], t0, t1, t2); 3607 } 3608 for (; j < len; j++) { 3609 MACC(m[j], n[i-j], t0, t1, t2); 3610 } 3611 m[i-len] = t0; 3612 t0 = t1; t1 = t2; t2 = 0; 3613 } 3614 3615 while (t0) 3616 t0 = sub(m, n, t0, len); 3617 } 3618 3619 // Swap words in a longword. 3620 static julong swap(julong x) { 3621 return (x << 32) | (x >> 32); 3622 } 3623 3624 // Copy len longwords from s to d, word-swapping as we go. The 3625 // destination array is reversed. 3626 static void reverse_words(julong *s, julong *d, int len) { 3627 d += len; 3628 while(len-- > 0) { 3629 d--; 3630 *d = swap(*s); 3631 s++; 3632 } 3633 } 3634 3635 // The threshold at which squaring is advantageous was determined 3636 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 3637 #define MONTGOMERY_SQUARING_THRESHOLD 64 3638 3639 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3640 jint len, jlong inv, 3641 jint *m_ints) { 3642 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3643 int longwords = len/2; 3644 3645 // Make very sure we don't use so much space that the stack might 3646 // overflow. 512 jints corresponds to an 16384-bit integer and 3647 // will use here a total of 8k bytes of stack space. 3648 int divisor = sizeof(julong) * 4; 3649 guarantee(longwords <= 8192 / divisor, "must be"); 3650 int total_allocation = longwords * sizeof (julong) * 4; 3651 julong *scratch = (julong *)alloca(total_allocation); 3652 3653 // Local scratch arrays 3654 julong 3655 *a = scratch + 0 * longwords, 3656 *b = scratch + 1 * longwords, 3657 *n = scratch + 2 * longwords, 3658 *m = scratch + 3 * longwords; 3659 3660 reverse_words((julong *)a_ints, a, longwords); 3661 reverse_words((julong *)b_ints, b, longwords); 3662 reverse_words((julong *)n_ints, n, longwords); 3663 3664 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3665 3666 reverse_words(m, (julong *)m_ints, longwords); 3667 } 3668 3669 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3670 jint len, jlong inv, 3671 jint *m_ints) { 3672 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3673 int longwords = len/2; 3674 3675 // Make very sure we don't use so much space that the stack might 3676 // overflow. 512 jints corresponds to an 16384-bit integer and 3677 // will use here a total of 6k bytes of stack space. 3678 int divisor = sizeof(julong) * 3; 3679 guarantee(longwords <= (8192 / divisor), "must be"); 3680 int total_allocation = longwords * sizeof (julong) * 3; 3681 julong *scratch = (julong *)alloca(total_allocation); 3682 3683 // Local scratch arrays 3684 julong 3685 *a = scratch + 0 * longwords, 3686 *n = scratch + 1 * longwords, 3687 *m = scratch + 2 * longwords; 3688 3689 reverse_words((julong *)a_ints, a, longwords); 3690 reverse_words((julong *)n_ints, n, longwords); 3691 3692 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3693 ::montgomery_square(a, n, m, (julong)inv, longwords); 3694 } else { 3695 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3696 } 3697 3698 reverse_words(m, (julong *)m_ints, longwords); 3699 } 3700 3701 #ifdef COMPILER2 3702 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 3703 // 3704 //------------------------------generate_exception_blob--------------------------- 3705 // creates exception blob at the end 3706 // Using exception blob, this code is jumped from a compiled method. 3707 // (see emit_exception_handler in x86_64.ad file) 3708 // 3709 // Given an exception pc at a call we call into the runtime for the 3710 // handler in this method. This handler might merely restore state 3711 // (i.e. callee save registers) unwind the frame and jump to the 3712 // exception handler for the nmethod if there is no Java level handler 3713 // for the nmethod. 3714 // 3715 // This code is entered with a jmp. 3716 // 3717 // Arguments: 3718 // rax: exception oop 3719 // rdx: exception pc 3720 // 3721 // Results: 3722 // rax: exception oop 3723 // rdx: exception pc in caller or ??? 3724 // destination: exception handler of caller 3725 // 3726 // Note: the exception pc MUST be at a call (precise debug information) 3727 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved. 3728 // 3729 3730 void OptoRuntime::generate_exception_blob() { 3731 assert(!OptoRuntime::is_callee_saved_register(RDX_num), ""); 3732 assert(!OptoRuntime::is_callee_saved_register(RAX_num), ""); 3733 assert(!OptoRuntime::is_callee_saved_register(RCX_num), ""); 3734 3735 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 3736 3737 // Allocate space for the code 3738 ResourceMark rm; 3739 // Setup code generation tools 3740 CodeBuffer buffer("exception_blob", 2048, 1024); 3741 MacroAssembler* masm = new MacroAssembler(&buffer); 3742 3743 3744 address start = __ pc(); 3745 3746 // Exception pc is 'return address' for stack walker 3747 __ push(rdx); 3748 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog 3749 3750 // Save callee-saved registers. See x86_64.ad. 3751 3752 // rbp is an implicitly saved callee saved register (i.e., the calling 3753 // convention will save/restore it in the prolog/epilog). Other than that 3754 // there are no callee save registers now that adapter frames are gone. 3755 3756 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 3757 3758 // Store exception in Thread object. We cannot pass any arguments to the 3759 // handle_exception call, since we do not want to make any assumption 3760 // about the size of the frame where the exception happened in. 3761 // c_rarg0 is either rdi (Linux) or rcx (Windows). 3762 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax); 3763 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 3764 3765 // This call does all the hard work. It checks if an exception handler 3766 // exists in the method. 3767 // If so, it returns the handler address. 3768 // If not, it prepares for stack-unwinding, restoring the callee-save 3769 // registers of the frame being removed. 3770 // 3771 // address OptoRuntime::handle_exception_C(JavaThread* thread) 3772 3773 // At a method handle call, the stack may not be properly aligned 3774 // when returning with an exception. 3775 address the_pc = __ pc(); 3776 __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1); 3777 __ mov(c_rarg0, r15_thread); 3778 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3779 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C))); 3780 3781 // Set an oopmap for the call site. This oopmap will only be used if we 3782 // are unwinding the stack. Hence, all locations will be dead. 3783 // Callee-saved registers will be the same as the frame above (i.e., 3784 // handle_exception_stub), since they were restored when we got the 3785 // exception. 3786 3787 OopMapSet* oop_maps = new OopMapSet(); 3788 3789 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3790 3791 __ reset_last_Java_frame(false); 3792 3793 // Restore callee-saved registers 3794 3795 // rbp is an implicitly saved callee-saved register (i.e., the calling 3796 // convention will save restore it in prolog/epilog) Other than that 3797 // there are no callee save registers now that adapter frames are gone. 3798 3799 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3800 3801 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3802 __ pop(rdx); // No need for exception pc anymore 3803 3804 // rax: exception handler 3805 3806 // We have a handler in rax (could be deopt blob). 3807 __ mov(r8, rax); 3808 3809 // Get the exception oop 3810 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3811 // Get the exception pc in case we are deoptimized 3812 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3813 #ifdef ASSERT 3814 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD); 3815 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 3816 #endif 3817 // Clear the exception oop so GC no longer processes it as a root. 3818 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 3819 3820 // rax: exception oop 3821 // r8: exception handler 3822 // rdx: exception pc 3823 // Jump to handler 3824 3825 __ jmp(r8); 3826 3827 // Make sure all code is generated 3828 masm->flush(); 3829 3830 // Set exception blob 3831 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3832 } 3833 #endif // COMPILER2