/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/SCCache.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
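  // Note (added for clarity; offsets per the standard non-compacted x86 XSAVE
  // layout as enumerated via CPUID, see the Intel SDM): the legacy FXSAVE image
  // holds xmm0..xmm15 starting at byte 160, the AVX high halves (YMM_Hi128
  // state) start at byte 576, the APX extended GPRs at byte 960, the AVX-512
  // opmask registers at byte 1088, the zmm0..zmm15 high halves (ZMM_Hi256
  // state) at byte 1152, and the upper-bank registers zmm16..zmm31 (Hi16_ZMM
  // state) at byte 1664. These are the XSAVE_AREA_* constants defined below.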
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r31H_off,
    r30_off, r30H_off,
    r29_off, r29H_off,
    r28_off, r28H_off,
    r27_off, r27H_off,
    r26_off, r26H_off,
    r25_off, r25H_off,
    r24_off, r24H_off,
    r23_off, r23H_off,
    r22_off, r22H_off,
    r21_off, r21H_off,
    r20_off, r20H_off,
    r19_off, r19H_off,
    r18_off, r18H_off,
    r17_off, r17H_off,
    r16_off, r16H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
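  // For orientation (a sketch, matching the layout enum above): after this
  // sequence the frame holds, from high to low addresses, the return address,
  // the saved rbp, saved flags, one 8-byte alignment word, the sixteen legacy
  // GPRs, the FPUStateSizeInWords-sized fxsave/xsave image, and finally the
  // optional argument register save area at the lowest addresses.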
  __ enter();  // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
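  // Each 64-bit register occupies two 32-bit compiler slots in the save area;
  // the *_off enum entries name the low halves and the *H_off entries the high
  // halves, which is why both halves are registered in the oopmap below.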

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.
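  // Java results come back in xmm0 (float/double) or rax (everything else);
  // rdx is saved and restored alongside them here, matching the offsets this
  // blob exposes to the deoptimization code via *_offset_in_bytes() above.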

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and registers.
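  // For illustration (a hypothetical signature, not taken from a caller): a
  // method taking (long, int, double, float) would get j_rarg0 (set2, for the
  // long), j_rarg1 (set1, for the int), j_farg0 (set2, for the double) and
  // j_farg1 (set1, for the float); only arguments beyond the six integer or
  // eight FP argument registers fall back to 32-bit stack slots below.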
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //   i   st_off
    //   0   32 T_LONG
    //   1   24 T_VOID
    //   2   16 T_OBJECT
    //   3    8 T_BOOL
    //   -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot.  In this case the
    // slot that is occupied is the T_VOID slot.  See I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, else we
  // lose the alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
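  // (For example, five 4-byte outgoing slots: 5*4 = 20 bytes, aligned up to
  // 24 bytes, i.e. 3 words of outgoing compiled-argument space.)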
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // always sees the placement of the call instruction it expects
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs returning Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
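  // The code below lays out several entry points into this one blob, in order:
  // c2i_unverified_entry (inline cache check, falling through on a hit),
  // c2i_entry (class-initialization barrier for static methods, when
  // supported), c2i_no_clinit_check_entry (past that barrier), and finally the
  // shared argument-shuffling body emitted by gen_c2i_adapter().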

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);

  if (CheckJNICalls) {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // If the held monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
    __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // rax may hold an exception oop, save it before the call
    __ push(rax);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
    __ pop(rax);

    // For vthreads we have to explicitly zero the JNI monitor count of the carrier
    // on termination. The held count is implicitly zeroed below when we restore from
    // the parent held count (which has to be zero).
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#ifdef ASSERT
  else {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // See comment just above. If not checking JNI calls the JNI count is only
    // needed for assertion checking.
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#endif

  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
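  // On x86_64 the shifted Java convention places these three arguments in
  // c_rarg1..c_rarg3 (i.e. j_rarg0..j_rarg2), which is what is pinned down
  // and verified here.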
1511 Register reg_cont_obj = c_rarg1; 1512 Register reg_is_cont = c_rarg2; 1513 Register reg_is_virtual = c_rarg3; 1514 1515 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1516 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1517 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1518 1519 // Utility methods kill rax, make sure there are no collisions 1520 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1521 1522 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1523 relocInfo::static_call_type); 1524 1525 address start = __ pc(); 1526 1527 Label L_thaw, L_exit; 1528 1529 // i2i entry used at interp_only_mode only 1530 interpreted_entry_offset = __ pc() - start; 1531 { 1532 #ifdef ASSERT 1533 Label is_interp_only; 1534 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1535 __ jcc(Assembler::notEqual, is_interp_only); 1536 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1537 __ bind(is_interp_only); 1538 #endif 1539 1540 __ pop(rax); // return address 1541 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1542 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1543 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1544 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1545 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1546 __ push(rax); // return address 1547 __ push_cont_fastpath(); 1548 1549 __ enter(); 1550 1551 stack_slots = 2; // will be adjusted in setup 1552 OopMap* map = continuation_enter_setup(masm, stack_slots); 1553 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1554 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1555 1556 __ verify_oop(reg_cont_obj); 1557 1558 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1559 1560 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1561 __ testptr(reg_is_cont, reg_is_cont); 1562 __ jcc(Assembler::notZero, L_thaw); 1563 1564 // --- Resolve path 1565 1566 // Make sure the call is patchable 1567 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1568 // Emit stub for static call 1569 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1570 if (stub == nullptr) { 1571 fatal("CodeCache is full at gen_continuation_enter"); 1572 } 1573 __ call(resolve); 1574 oop_maps->add_gc_map(__ pc() - start, map); 1575 __ post_call_nop(); 1576 1577 __ jmp(L_exit); 1578 } 1579 1580 // compiled entry 1581 __ align(CodeEntryAlignment); 1582 compiled_entry_offset = __ pc() - start; 1583 __ enter(); 1584 1585 stack_slots = 2; // will be adjusted in setup 1586 OopMap* map = continuation_enter_setup(masm, stack_slots); 1587 1588 // Frame is now completed as far as size and linkage. 1589 frame_complete = __ pc() - start; 1590 1591 __ verify_oop(reg_cont_obj); 1592 1593 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1594 1595 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1596 __ testptr(reg_is_cont, reg_is_cont); 1597 __ jccb(Assembler::notZero, L_thaw); 1598 1599 // --- call Continuation.enter(Continuation c, boolean isContinue) 1600 1601 // Make sure the call is patchable 1602 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1603 1604 // Emit stub for static call 1605 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1606 if (stub == nullptr) { 1607 fatal("CodeCache is full at gen_continuation_enter"); 1608 } 1609 1610 // The call needs to be resolved. There's a special case for this in 1611 // SharedRuntime::find_callee_info_helper() which calls 1612 // LinkResolver::resolve_continuation_enter() which resolves the call to 1613 // Continuation.enter(Continuation c, boolean isContinue). 1614 __ call(resolve); 1615 1616 oop_maps->add_gc_map(__ pc() - start, map); 1617 __ post_call_nop(); 1618 1619 __ jmpb(L_exit); 1620 1621 // --- Thawing path 1622 1623 __ bind(L_thaw); 1624 1625 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1626 1627 ContinuationEntry::_return_pc_offset = __ pc() - start; 1628 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1629 __ post_call_nop(); 1630 1631 // --- Normal exit (resolve/thawing) 1632 1633 __ bind(L_exit); 1634 1635 continuation_enter_cleanup(masm); 1636 __ pop(rbp); 1637 __ ret(0); 1638 1639 // --- Exception handling path 1640 1641 exception_offset = __ pc() - start; 1642 1643 continuation_enter_cleanup(masm); 1644 __ pop(rbp); 1645 1646 __ movptr(c_rarg0, r15_thread); 1647 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1648 1649 // rax still holds the original exception oop, save it before the call 1650 __ push(rax); 1651 1652 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1653 __ movptr(rbx, rax); 1654 1655 // Continue at exception handler: 1656 // rax: exception oop 1657 // rbx: exception handler 1658 // rdx: exception pc 1659 __ pop(rax); 1660 __ verify_oop(rax); 1661 __ pop(rdx); 1662 __ jmp(rbx); 1663 } 1664 1665 static void gen_continuation_yield(MacroAssembler* masm, 1666 const VMRegPair* regs, 1667 OopMapSet* oop_maps, 1668 int& frame_complete, 1669 int& stack_slots, 1670 int& compiled_entry_offset) { 1671 enum layout { 1672 rbp_off, 1673 rbpH_off, 1674 return_off, 1675 return_off2, 1676 framesize // inclusive of return address 1677 }; 1678 stack_slots = framesize / VMRegImpl::slots_per_word; 1679 assert(stack_slots == 2, "recheck layout"); 1680 1681 address start = __ pc(); 1682 compiled_entry_offset = __ pc() - start; 1683 __ enter(); 1684 address the_pc = __ pc(); 1685 1686 frame_complete = the_pc - start; 1687 1688 // This nop must be exactly at the PC we push into the frame info. 1689 // We use this nop for fast CodeBlob lookup, associate the OopMap 1690 // with it right away. 
1691 __ post_call_nop(); 1692 OopMap* map = new OopMap(framesize, 1); 1693 oop_maps->add_gc_map(frame_complete, map); 1694 1695 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1696 __ movptr(c_rarg0, r15_thread); 1697 __ movptr(c_rarg1, rsp); 1698 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1699 __ reset_last_Java_frame(true); 1700 1701 Label L_pinned; 1702 1703 __ testptr(rax, rax); 1704 __ jcc(Assembler::notZero, L_pinned); 1705 1706 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1707 continuation_enter_cleanup(masm); 1708 __ pop(rbp); 1709 __ ret(0); 1710 1711 __ bind(L_pinned); 1712 1713 // Pinned, return to caller 1714 1715 // handle pending exception thrown by freeze 1716 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1717 Label ok; 1718 __ jcc(Assembler::equal, ok); 1719 __ leave(); 1720 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1721 __ bind(ok); 1722 1723 __ leave(); 1724 __ ret(0); 1725 } 1726 1727 static void gen_special_dispatch(MacroAssembler* masm, 1728 const methodHandle& method, 1729 const BasicType* sig_bt, 1730 const VMRegPair* regs) { 1731 verify_oop_args(masm, method, sig_bt, regs); 1732 vmIntrinsics::ID iid = method->intrinsic_id(); 1733 1734 // Now write the args into the outgoing interpreter space 1735 bool has_receiver = false; 1736 Register receiver_reg = noreg; 1737 int member_arg_pos = -1; 1738 Register member_reg = noreg; 1739 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1740 if (ref_kind != 0) { 1741 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1742 member_reg = rbx; // known to be free at this point 1743 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1744 } else if (iid == vmIntrinsics::_invokeBasic) { 1745 has_receiver = true; 1746 } else if (iid == vmIntrinsics::_linkToNative) { 1747 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1748 member_reg = rbx; // known to be free at this point 1749 } else { 1750 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1751 } 1752 1753 if (member_reg != noreg) { 1754 // Load the member_arg into register, if necessary. 1755 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1756 VMReg r = regs[member_arg_pos].first(); 1757 if (r->is_stack()) { 1758 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1759 } else { 1760 // no data motion is needed 1761 member_reg = r->as_Register(); 1762 } 1763 } 1764 1765 if (has_receiver) { 1766 // Make sure the receiver is loaded into a register. 1767 assert(method->size_of_parameters() > 0, "oob"); 1768 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1769 VMReg r = regs[0].first(); 1770 assert(r->is_valid(), "bad receiver arg"); 1771 if (r->is_stack()) { 1772 // Porting note: This assumes that compiled calling conventions always 1773 // pass the receiver oop in a register. If this is not true on some 1774 // platform, pick a temp and load the receiver from stack. 
1775 fatal("receiver always in a register"); 1776 receiver_reg = j_rarg0; // known to be free at this point 1777 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1778 } else { 1779 // no data motion is needed 1780 receiver_reg = r->as_Register(); 1781 } 1782 } 1783 1784 // Figure out which address we are really jumping to: 1785 MethodHandles::generate_method_handle_dispatch(masm, iid, 1786 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1787 } 1788 1789 // --------------------------------------------------------------------------- 1790 // Generate a native wrapper for a given method. The method takes arguments 1791 // in the Java compiled code convention, marshals them to the native 1792 // convention (handlizes oops, etc), transitions to native, makes the call, 1793 // returns to java state (possibly blocking), unhandlizes any result and 1794 // returns. 1795 // 1796 // Critical native functions are a shorthand for the use of 1797 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1798 // functions. The wrapper is expected to unpack the arguments before 1799 // passing them to the callee. Critical native functions leave the state _in_Java, 1800 // since they cannot stop for GC. 1801 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1802 // block and the check for pending exceptions it's impossible for them 1803 // to be thrown. 1804 // 1805 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1806 const methodHandle& method, 1807 int compile_id, 1808 BasicType* in_sig_bt, 1809 VMRegPair* in_regs, 1810 BasicType ret_type) { 1811 if (method->is_continuation_native_intrinsic()) { 1812 int exception_offset = -1; 1813 OopMapSet* oop_maps = new OopMapSet(); 1814 int frame_complete = -1; 1815 int stack_slots = -1; 1816 int interpreted_entry_offset = -1; 1817 int vep_offset = -1; 1818 if (method->is_continuation_enter_intrinsic()) { 1819 gen_continuation_enter(masm, 1820 in_regs, 1821 exception_offset, 1822 oop_maps, 1823 frame_complete, 1824 stack_slots, 1825 interpreted_entry_offset, 1826 vep_offset); 1827 } else if (method->is_continuation_yield_intrinsic()) { 1828 gen_continuation_yield(masm, 1829 in_regs, 1830 oop_maps, 1831 frame_complete, 1832 stack_slots, 1833 vep_offset); 1834 } else { 1835 guarantee(false, "Unknown Continuation native intrinsic"); 1836 } 1837 1838 #ifdef ASSERT 1839 if (method->is_continuation_enter_intrinsic()) { 1840 assert(interpreted_entry_offset != -1, "Must be set"); 1841 assert(exception_offset != -1, "Must be set"); 1842 } else { 1843 assert(interpreted_entry_offset == -1, "Must be unset"); 1844 assert(exception_offset == -1, "Must be unset"); 1845 } 1846 assert(frame_complete != -1, "Must be set"); 1847 assert(stack_slots != -1, "Must be set"); 1848 assert(vep_offset != -1, "Must be set"); 1849 #endif 1850 1851 __ flush(); 1852 nmethod* nm = nmethod::new_native_nmethod(method, 1853 compile_id, 1854 masm->code(), 1855 vep_offset, 1856 frame_complete, 1857 stack_slots, 1858 in_ByteSize(-1), 1859 in_ByteSize(-1), 1860 oop_maps, 1861 exception_offset); 1862 if (nm == nullptr) return nm; 1863 if (method->is_continuation_enter_intrinsic()) { 1864 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1865 } else if (method->is_continuation_yield_intrinsic()) { 1866 _cont_doYield_stub = nm; 1867 } 1868 return nm; 1869 } 1870 1871 if (method->is_method_handle_intrinsic()) { 1872 vmIntrinsics::ID iid = method->intrinsic_id(); 1873 intptr_t 
start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).
  // For example, a static int m(long) becomes C's (JNIEnv*, jclass, jlong).

  const int total_in_args = method->size_of_parameters();
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  BasicType* in_elem_bt = nullptr;

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT;
  }

  for (int i = 0; i < total_in_args; i++) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper. We need to handlize all oops in
  // incoming registers

  // Calculate the total number of stack slots we will need.
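  // (Unit note: the counts below are in 4-byte VMReg stack slots, two slots
  // per 64-bit word. For instance, the 6-register oop handle area further
  // down is 6 * 2 = 12 slots = 48 bytes, and the final total is rounded up
  // so that rsp stays 16-byte aligned.)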
1931 1932 // First count the abi requirement plus all of the outgoing args 1933 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1934 1935 // Now the space for the inbound oop handle area 1936 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1937 1938 int oop_handle_offset = stack_slots; 1939 stack_slots += total_save_slots; 1940 1941 // Now any space we need for handlizing a klass if static method 1942 1943 int klass_slot_offset = 0; 1944 int klass_offset = -1; 1945 int lock_slot_offset = 0; 1946 bool is_static = false; 1947 1948 if (method->is_static()) { 1949 klass_slot_offset = stack_slots; 1950 stack_slots += VMRegImpl::slots_per_word; 1951 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1952 is_static = true; 1953 } 1954 1955 // Plus a lock if needed 1956 1957 if (method->is_synchronized()) { 1958 lock_slot_offset = stack_slots; 1959 stack_slots += VMRegImpl::slots_per_word; 1960 } 1961 1962 // Now a place (+2) to save return values or temp during shuffling 1963 // + 4 for return address (which we own) and saved rbp 1964 stack_slots += 6; 1965 1966 // Ok The space we have allocated will look like: 1967 // 1968 // 1969 // FP-> | | 1970 // |---------------------| 1971 // | 2 slots for moves | 1972 // |---------------------| 1973 // | lock box (if sync) | 1974 // |---------------------| <- lock_slot_offset 1975 // | klass (if static) | 1976 // |---------------------| <- klass_slot_offset 1977 // | oopHandle area | 1978 // |---------------------| <- oop_handle_offset (6 java arg registers) 1979 // | outbound memory | 1980 // | based arguments | 1981 // | | 1982 // |---------------------| 1983 // | | 1984 // SP-> | out_preserved_slots | 1985 // 1986 // 1987 1988 1989 // Now compute actual number of stack words we need rounding to make 1990 // stack properly aligned. 1991 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1992 1993 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1994 1995 // First thing make an ic check to see if we should even be here 1996 1997 // We are free to use all registers as temps without saving them and 1998 // restoring them except rbp. rbp is the only callee save register 1999 // as far as the interpreter and the compiler(s) are concerned. 2000 2001 const Register receiver = j_rarg0; 2002 2003 Label exception_pending; 2004 2005 assert_different_registers(receiver, rscratch1, rscratch2); 2006 __ verify_oop(receiver); 2007 __ ic_check(8 /* end_alignment */); 2008 2009 int vep_offset = ((intptr_t)__ pc()) - start; 2010 2011 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2012 Label L_skip_barrier; 2013 Register klass = r10; 2014 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2015 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2016 2017 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2018 2019 __ bind(L_skip_barrier); 2020 } 2021 2022 #ifdef COMPILER1 2023 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // The native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */


  // We use r14 as the oop handle for the receiver/klass
  // It is callee save so it survives the call to native

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do
  // All inbound args are referenced based on rbp and all outbound args via rsp.


#ifdef ASSERT
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for (int r = 0; r < Register::number_of_registers; r++) {
    reg_destroyed[r] = false;
  }
  for (int f = 0; f < XMMRegister::number_of_registers; f++) {
    freg_destroyed[f] = false;
  }

#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
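  // (Worked example, assuming the System V ABI and a static method, where
  // Java arg i must land in C arg i + 2: Java arg0 arrives in rsi and must
  // end up in rdx, while Java arg1 arrives in rdx and must end up in rcx.
  // Moving backwards copies rdx -> rcx before rsi -> rdx; moving forwards,
  // rsi -> rdx would clobber Java arg1 before it had been read.)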
  GrowableArray<int> arg_order(2 * total_in_args);

  VMRegPair tmp_vmreg;
  tmp_vmreg.set2(rbx->as_VMReg());

  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  int temploc = -1;
  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_DOUBLE:
        assert(i + 1 < total_in_args &&
               in_sig_bt[i + 1] == T_VOID &&
               out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");  // fall through (asserts in debug builds)

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14. Used both by locking code and
  // the normal JNI call code.
  // Point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out.
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    // load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror; it's known to be non-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out.

  intptr_t the_pc = (intptr_t) __ pc();
  oop_maps->add_gc_map(the_pc - start, map);

  __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);


  // We have all of the arguments set up at this point. From here on we must
  // not touch any of the register argument registers: there is no safe way
  // to save and restore them, since the oop map does not cover them.
  if (DTraceMethodProbes) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
  const Register old_hdr  = r13;  // value of old header at unlock time

  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    Label count_mon;

    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();

    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box

    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    if (LockingMode == LM_MONITOR) {
      __ jmp(slow_path_lock);
    } else if (LockingMode == LM_LEGACY) {
      // Load immediate 1 into swap_reg %rax
      __ movl(swap_reg, 1);

      // Load (object->mark() | 1) into swap_reg %rax
      __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));

      // Save (object->mark() | 1) into BasicLock's displaced header
      __ movptr(Address(lock_reg, mark_word_offset), swap_reg);

      // src -> dest iff dest == rax else rax <- dest
      __ lock();
      __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
      __ jcc(Assembler::equal, count_mon);

      // Hmm should this move to the slow path code area???

      // Test if the oopMark is an obvious stack pointer, i.e.,
      //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::vm_page_size()
      // These 3 tests can be done by evaluating the following
      // expression: ((mark - rsp) & (3 - os::vm_page_size())),
      // assuming both stack pointer and pagesize have their
      // least significant 2 bits clear.
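      // (Concretely, assuming a 4K page: 3 - 4096 == -4093, i.e. the mask
      // 0x...fffff003, so the result is zero iff (mark - rsp) has clear tag
      // bits and is less than one page above rsp -- that is, the mark points
      // into our own stack and this is a recursive stack lock.)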
      // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg

      __ subptr(swap_reg, rsp);
      __ andptr(swap_reg, 3 - (int)os::vm_page_size());

      // Save the test result; for the recursive case the result is zero
      __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
      __ jcc(Assembler::notEqual, slow_path_lock);
    } else {
      assert(LockingMode == LM_LIGHTWEIGHT, "must be");
      __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
    }
    __ bind(count_mon);
    __ inc_held_monitor_count();

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally just about ready to make the JNI call

  // get JNIEnv* which is first argument to native
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);       break;
  case T_BYTE   : __ sign_extend_byte (rax); break;
  case T_SHORT  : __ sign_extend_short(rax); break;
  case T_INT    : /* nothing to do */        break;
  case T_DOUBLE :
  case T_FLOAT  :
    // Result is in xmm0 we'll save as needed
    break;
  case T_ARRAY:                 // Really a handle
  case T_OBJECT:                // Really a handle
    break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default       : ShouldNotReachHere();
  }

  Label after_transition;

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  // VM thread changes sync state to synchronizing and suspends threads for GC.
  // Thread A is resumed to finish this native method, but doesn't block here since it
  // didn't see any synchronization in progress, and escapes.
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  if (!UseSystemMemoryBarrier) {
    __ membar(Assembler::Membar_mask_bits(
                Assembler::LoadLoad | Assembler::LoadStore |
                Assembler::StoreLoad | Assembler::StoreStore));
  }

  // check for safepoint operation in progress and/or pending suspend requests
  {
    Label Continue;
    Label slow_path;

    __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);

    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(slow_path);

    // Don't use call_VM, as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // Also can't use call_VM_leaf, as it will check whether rsi and rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
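    // (The hand-rolled sequence below follows a fixed pattern: stash rsp in
    // the callee-saved r12, reserve the Windows register-argument shadow area
    // (frame::arg_reg_save_area_bytes, zero on Linux), align rsp to 16 bytes
    // as the C ABI requires, make the call, then restore rsp and reload the
    // compressed-oops heap base into r12.)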
2363 // 2364 __ vzeroupper(); 2365 save_native_result(masm, ret_type, stack_slots); 2366 __ mov(c_rarg0, r15_thread); 2367 __ mov(r12, rsp); // remember sp 2368 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2369 __ andptr(rsp, -16); // align stack as required by ABI 2370 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2371 __ mov(rsp, r12); // restore sp 2372 __ reinit_heapbase(); 2373 // Restore any method result value 2374 restore_native_result(masm, ret_type, stack_slots); 2375 __ bind(Continue); 2376 } 2377 2378 // change thread state 2379 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2380 __ bind(after_transition); 2381 2382 Label reguard; 2383 Label reguard_done; 2384 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2385 __ jcc(Assembler::equal, reguard); 2386 __ bind(reguard_done); 2387 2388 // native result if any is live 2389 2390 // Unlock 2391 Label slow_path_unlock; 2392 Label unlock_done; 2393 if (method->is_synchronized()) { 2394 2395 Label fast_done; 2396 2397 // Get locked oop from the handle we passed to jni 2398 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2399 2400 if (LockingMode == LM_LEGACY) { 2401 Label not_recur; 2402 // Simple recursive lock? 2403 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2404 __ jcc(Assembler::notEqual, not_recur); 2405 __ dec_held_monitor_count(); 2406 __ jmpb(fast_done); 2407 __ bind(not_recur); 2408 } 2409 2410 // Must save rax if it is live now because cmpxchg must use it 2411 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2412 save_native_result(masm, ret_type, stack_slots); 2413 } 2414 2415 if (LockingMode == LM_MONITOR) { 2416 __ jmp(slow_path_unlock); 2417 } else if (LockingMode == LM_LEGACY) { 2418 // get address of the stack lock 2419 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2420 // get old displaced header 2421 __ movptr(old_hdr, Address(rax, 0)); 2422 2423 // Atomic swap old header if oop still contains the stack lock 2424 __ lock(); 2425 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2426 __ jcc(Assembler::notEqual, slow_path_unlock); 2427 __ dec_held_monitor_count(); 2428 } else { 2429 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2430 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2431 __ dec_held_monitor_count(); 2432 } 2433 2434 // slow path re-enters here 2435 __ bind(unlock_done); 2436 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2437 restore_native_result(masm, ret_type, stack_slots); 2438 } 2439 2440 __ bind(fast_done); 2441 } 2442 if (DTraceMethodProbes) { 2443 save_native_result(masm, ret_type, stack_slots); 2444 __ mov_metadata(c_rarg1, method()); 2445 __ call_VM_leaf( 2446 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2447 r15_thread, c_rarg1); 2448 restore_native_result(masm, ret_type, stack_slots); 2449 } 2450 2451 __ reset_last_Java_frame(false); 2452 2453 // Unbox oop result, e.g. JNIHandles::resolve value. 
2454 if (is_reference_type(ret_type)) { 2455 __ resolve_jobject(rax /* value */, 2456 r15_thread /* thread */, 2457 rcx /* tmp */); 2458 } 2459 2460 if (CheckJNICalls) { 2461 // clear_pending_jni_exception_check 2462 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2463 } 2464 2465 // reset handle block 2466 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2467 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2468 2469 // pop our frame 2470 2471 __ leave(); 2472 2473 // Any exception pending? 2474 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2475 __ jcc(Assembler::notEqual, exception_pending); 2476 2477 // Return 2478 2479 __ ret(0); 2480 2481 // Unexpected paths are out of line and go here 2482 2483 // forward the exception 2484 __ bind(exception_pending); 2485 2486 // and forward the exception 2487 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2488 2489 // Slow path locking & unlocking 2490 if (method->is_synchronized()) { 2491 2492 // BEGIN Slow path lock 2493 __ bind(slow_path_lock); 2494 2495 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2496 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2497 2498 // protect the args we've loaded 2499 save_args(masm, total_c_args, c_arg, out_regs); 2500 2501 __ mov(c_rarg0, obj_reg); 2502 __ mov(c_rarg1, lock_reg); 2503 __ mov(c_rarg2, r15_thread); 2504 2505 // Not a leaf but we have last_Java_frame setup as we want 2506 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2507 restore_args(masm, total_c_args, c_arg, out_regs); 2508 2509 #ifdef ASSERT 2510 { Label L; 2511 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2512 __ jcc(Assembler::equal, L); 2513 __ stop("no pending exception allowed on exit from monitorenter"); 2514 __ bind(L); 2515 } 2516 #endif 2517 __ jmp(lock_done); 2518 2519 // END Slow path lock 2520 2521 // BEGIN Slow path unlock 2522 __ bind(slow_path_unlock); 2523 2524 // If we haven't already saved the native result we must save it now as xmm registers 2525 // are still exposed. 
    __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg2, r15_thread);
    __ mov(r12, rsp);  // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes);  // windows
    __ andptr(rsp, -16);  // align stack as required by ABI

    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
    // NOTE that obj_reg == rbx currently
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);

    // args are (oop obj, BasicLock* lock, JavaThread* thread)
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12);  // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */

    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);

    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);

    // END Slow path unlock

  } // synchronized

  // SLOW PATH Reguard the stack if needed

  __ bind(reguard);
  __ vzeroupper();
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp);  // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes);  // windows
  __ andptr(rsp, -16);  // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12);  // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  // and continue
  __ jmp(reguard_done);



  __ flush();

  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);

  return nm;
}

// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
}


uint SharedRuntime::out_preserve_stack_slots() {
  return 0;
}


// Number of stack slots between incoming argument block and the start of
// a new frame. The PROLOG must add this many slots to the stack. The
// EPILOG must remove this many slots. amd64 needs two slots for the
// return address and two for the saved rbp.
2617 uint SharedRuntime::in_preserve_stack_slots() { 2618 return 4 + 2 * VerifyStackAtCalls; 2619 } 2620 2621 //------------------------------generate_deopt_blob---------------------------- 2622 void SharedRuntime::generate_deopt_blob() { 2623 // Allocate space for the code 2624 ResourceMark rm; 2625 // Setup code generation tools 2626 int pad = 0; 2627 if (UseAVX > 2) { 2628 pad += 1024; 2629 } 2630 if (UseAPX) { 2631 pad += 1024; 2632 } 2633 #if INCLUDE_JVMCI 2634 if (EnableJVMCI) { 2635 pad += 512; // Increase the buffer size when compiling for JVMCI 2636 } 2637 #endif 2638 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2639 MacroAssembler* masm = new MacroAssembler(&buffer); 2640 int frame_size_in_words; 2641 OopMap* map = nullptr; 2642 OopMapSet *oop_maps = new OopMapSet(); 2643 2644 // ------------- 2645 // This code enters when returning to a de-optimized nmethod. A return 2646 // address has been pushed on the stack, and return values are in 2647 // registers. 2648 // If we are doing a normal deopt then we were called from the patched 2649 // nmethod from the point we returned to the nmethod. So the return 2650 // address on the stack is wrong by NativeCall::instruction_size 2651 // We will adjust the value so it looks like we have the original return 2652 // address on the stack (like when we eagerly deoptimized). 2653 // In the case of an exception pending when deoptimizing, we enter 2654 // with a return address on the stack that points after the call we patched 2655 // into the exception handler. We have the following register state from, 2656 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2657 // rax: exception oop 2658 // rbx: exception handler 2659 // rdx: throwing pc 2660 // So in this case we simply jam rdx into the useless return address and 2661 // the stack looks just like we want. 2662 // 2663 // At this point we need to de-opt. We save the argument return 2664 // registers. We call the first C routine, fetch_unroll_info(). This 2665 // routine captures the return values and returns a structure which 2666 // describes the current frame size and the sizes of all replacement frames. 2667 // The current frame is compiled code and may contain many inlined 2668 // functions, each with their own JVM state. We pop the current frame, then 2669 // push all the new frames. Then we call the C routine unpack_frames() to 2670 // populate these frames. Finally unpack_frames() returns us the new target 2671 // address. Notice that callee-save registers are BLOWN here; they have 2672 // already been captured in the vframeArray at the time the return PC was 2673 // patched. 2674 address start = __ pc(); 2675 Label cont; 2676 2677 // Prolog for non exception case! 2678 2679 // Save everything in sight. 2680 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2681 2682 // Normal deoptimization. Save exec mode for unpack_frames. 
  __ movl(r14, Deoptimization::Unpack_deopt);  // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (EnableJVMCI && UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute);  // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14);  // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map(__ pc() - start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception.
Save exec mode for unpack_frames. 2769 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2770 2771 // load throwing pc from JavaThread and patch it as the return address 2772 // of the current frame. Then clear the field in JavaThread 2773 2774 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2775 __ movptr(Address(rbp, wordSize), rdx); 2776 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2777 2778 #ifdef ASSERT 2779 // verify that there is really an exception oop in JavaThread 2780 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2781 __ verify_oop(rax); 2782 2783 // verify that there is no pending exception 2784 Label no_pending_exception; 2785 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2786 __ testptr(rax, rax); 2787 __ jcc(Assembler::zero, no_pending_exception); 2788 __ stop("must not have pending exception here"); 2789 __ bind(no_pending_exception); 2790 #endif 2791 2792 __ bind(cont); 2793 2794 // Call C code. Need thread and this frame, but NOT official VM entry 2795 // crud. We cannot block on this call, no GC can happen. 2796 // 2797 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2798 2799 // fetch_unroll_info needs to call last_java_frame(). 2800 2801 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2802 #ifdef ASSERT 2803 { Label L; 2804 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2805 __ jcc(Assembler::equal, L); 2806 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2807 __ bind(L); 2808 } 2809 #endif // ASSERT 2810 __ mov(c_rarg0, r15_thread); 2811 __ movl(c_rarg1, r14); // exec_mode 2812 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2813 2814 // Need to have an oopmap that tells fetch_unroll_info where to 2815 // find any register it might need. 2816 oop_maps->add_gc_map(__ pc() - start, map); 2817 2818 __ reset_last_Java_frame(false); 2819 2820 #if INCLUDE_JVMCI 2821 if (EnableJVMCI) { 2822 __ bind(after_fetch_unroll_info_call); 2823 } 2824 #endif 2825 2826 // Load UnrollBlock* into rdi 2827 __ mov(rdi, rax); 2828 2829 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2830 Label noException; 2831 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2832 __ jcc(Assembler::notEqual, noException); 2833 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2834 // QQQ this is useless it was null above 2835 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2836 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2837 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2838 2839 __ verify_oop(rax); 2840 2841 // Overwrite the result registers with the exception results. 2842 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2843 // I think this is useless 2844 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2845 2846 __ bind(noException); 2847 2848 // Only register save data is on the stack. 2849 // Now restore the result registers. Everything else is either dead 2850 // or captured in the vframeArray. 2851 RegisterSaver::restore_result_registers(masm); 2852 2853 // All of the register save area has been popped of the stack. Only the 2854 // return address remains. 2855 2856 // Pop all the frames we must move/replace. 
2857 // 2858 // Frame picture (youngest to oldest) 2859 // 1: self-frame (no frame link) 2860 // 2: deopting frame (no frame link) 2861 // 3: caller of deopting frame (could be compiled/interpreted). 2862 // 2863 // Note: by leaving the return address of self-frame on the stack 2864 // and using the size of frame 2 to adjust the stack 2865 // when we are done the return to frame 3 will still be on the stack. 2866 2867 // Pop deoptimized frame 2868 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2869 __ addptr(rsp, rcx); 2870 2871 // rsp should be pointing at the return address to the caller (3) 2872 2873 // Pick up the initial fp we should save 2874 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2875 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2876 2877 #ifdef ASSERT 2878 // Compilers generate code that bang the stack by as much as the 2879 // interpreter would need. So this stack banging should never 2880 // trigger a fault. Verify that it does not on non product builds. 2881 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2882 __ bang_stack_size(rbx, rcx); 2883 #endif 2884 2885 // Load address of array of frame pcs into rcx 2886 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2887 2888 // Trash the old pc 2889 __ addptr(rsp, wordSize); 2890 2891 // Load address of array of frame sizes into rsi 2892 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2893 2894 // Load counter into rdx 2895 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2896 2897 // Now adjust the caller's stack to make up for the extra locals 2898 // but record the original sp so that we can save it in the skeletal interpreter 2899 // frame and the stack walking of interpreter_sender will get the unextended sp 2900 // value and not the "real" sp value. 2901 2902 const Register sender_sp = r8; 2903 2904 __ mov(sender_sp, rsp); 2905 __ movl(rbx, Address(rdi, 2906 Deoptimization::UnrollBlock:: 2907 caller_adjustment_offset())); 2908 __ subptr(rsp, rbx); 2909 2910 // Push interpreter frames in a loop 2911 Label loop; 2912 __ bind(loop); 2913 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2914 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2915 __ pushptr(Address(rcx, 0)); // Save return address 2916 __ enter(); // Save old & set new ebp 2917 __ subptr(rsp, rbx); // Prolog 2918 // This value is corrected by layout_activation_impl 2919 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2920 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2921 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2922 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2923 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2924 __ decrementl(rdx); // Decrement counter 2925 __ jcc(Assembler::notZero, loop); 2926 __ pushptr(Address(rcx, 0)); // Save final return address 2927 2928 // Re-push self-frame 2929 __ enter(); // Save old & set new ebp 2930 2931 // Allocate a full sized register save area. 2932 // Return address and rbp are in place, so we allocate two less words. 
2933 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2934 2935 // Restore frame locals after moving the frame 2936 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2937 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2938 2939 // Call C code. Need thread but NOT official VM entry 2940 // crud. We cannot block on this call, no GC can happen. Call should 2941 // restore return values to their stack-slots with the new SP. 2942 // 2943 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2944 2945 // Use rbp because the frames look interpreted now 2946 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2947 // Don't need the precise return PC here, just precise enough to point into this code blob. 2948 address the_pc = __ pc(); 2949 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2950 2951 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2952 __ mov(c_rarg0, r15_thread); 2953 __ movl(c_rarg1, r14); // second arg: exec_mode 2954 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2955 // Revert SP alignment after call since we're going to do some SP relative addressing below 2956 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2957 2958 // Set an oopmap for the call site 2959 // Use the same PC we used for the last java frame 2960 oop_maps->add_gc_map(the_pc - start, 2961 new OopMap( frame_size_in_words, 0 )); 2962 2963 // Clear fp AND pc 2964 __ reset_last_Java_frame(true); 2965 2966 // Collect return values 2967 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2968 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2969 // I think this is useless (throwing pc?) 2970 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2971 2972 // Pop self-frame. 2973 __ leave(); // Epilog 2974 2975 // Jump to interpreter 2976 __ ret(0); 2977 2978 // Make sure all code is generated 2979 masm->flush(); 2980 2981 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2982 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2983 #if INCLUDE_JVMCI 2984 if (EnableJVMCI) { 2985 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2986 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2987 } 2988 #endif 2989 } 2990 2991 #ifdef COMPILER2 2992 //------------------------------generate_uncommon_trap_blob-------------------- 2993 void SharedRuntime::generate_uncommon_trap_blob() { 2994 // Allocate space for the code 2995 ResourceMark rm; 2996 // Setup code generation tools 2997 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2998 MacroAssembler* masm = new MacroAssembler(&buffer); 2999 3000 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 3001 3002 address start = __ pc(); 3003 3004 // Push self-frame. We get here with a return address on the 3005 // stack, so rsp is 8-byte aligned until we allocate our frame. 3006 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 3007 3008 // No callee saved registers. rbp is assumed implicitly saved 3009 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 3010 3011 // compiler left unloaded_class_index in j_rarg0 move to where the 3012 // runtime expects it. 
  __ movl(c_rarg1, j_rarg0);

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // capture callee-saved registers as well as return values.
  // Thread is in rdi already.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);

  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));

  // Set an oopmap for the call site
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);

  // location of rbp is known implicitly by the frame sender code

  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
              Deoptimization::Unpack_uncommon_trap);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
    __ bind(L);
  }
#endif

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).

  // Pop self-frame. We have no frame, and must rely only on rax and rsp.
  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!

  // Pop deoptimized frame (int)
  __ movl(rcx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx (address*)
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the return pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)

  // Now adjust the caller's stack to make up for the extra locals but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame and the stack walking of interpreter_sender
  // will get the unextended sp value and not the "real" sp value.
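  // A C-style sketch of the frame-unrolling loop generated below
  // (illustrative comment only; sizes[], pcs[] and number_of_frames come
  // from the UnrollBlock):
  //
  //   intptr_t* sender_sp = rsp;
  //   rsp -= caller_adjustment;
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(pcs[k]);                        // return address
  //     push(rbp); rbp = rsp;                // enter()
  //     rsp -= sizes[k] - 2*wordSize;        // pc and rbp pushed by hand
  //     rbp[last_sp_offset]   = nullptr;     // fixed by layout_activation_impl
  //     rbp[sender_sp_offset] = sender_sp;   // make the frame walkable
  //     sender_sp = rsp;
  //   }
  //   push(pcs[number_of_frames]);           // final return address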

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, rbx);             // Prolog
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
            sender_sp);            // Make it walkable
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address

  // Re-push self-frame
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); // Prolog

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // restore return values to their stack-slots with the new SP.
  // Thread is in rdi already.
  //
  // BasicType unpack_frames(JavaThread* thread, int exec_mode);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Pop self-frame.
  __ leave(); // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code. Setup code generation tools.
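  // A summary of how the two poll_type-derived flags below steer the code
  // that follows (hedged orientation comment, not normative):
  //
  //   cause_return      (POLL_AT_RETURN):      the poll happened at a return,
  //                                            so a valid return address is
  //                                            already on the stack and is
  //                                            not re-pushed.
  //   save_wide_vectors (POLL_AT_VECTOR_LOOP): the poll may interrupt code
  //                                            with live wide vectors, so the
  //                                            full vector state is saved and
  //                                            restored around the call.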
  CodeBuffer buffer("handler_blob", 2348, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map(__ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
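    // A C-style sketch of the instruction-length computation performed in
    // assembly below (illustrative comment only; rbx holds the return pc,
    // which points at the poll):
    //
    //   if (pc[0] == 0x41) pc++;        // optional REX.B prefix
    //   int rm = pc[1] & 7;             // low 3 ModRM bits
    //   if (rm == 4 || rm == 5) pc++;   // SIB byte (rsp/r12) or disp8 (rbp/r13)
    //   return_pc = pc + 2;             // skip opcode 0x85 + ModRM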
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test %eax,(%rax)
    // 85 01       test %eax,(%rcx)
    // 85 02       test %eax,(%rdx)
    // 85 03       test %eax,(%rbx)
    // 85 06       test %eax,(%rsi)
    // 85 07       test %eax,(%rdi)
    //
    // 41 85 00    test %eax,(%r8)
    // 41 85 01    test %eax,(%r9)
    // 41 85 02    test %eax,(%r10)
    // 41 85 03    test %eax,(%r11)
    // 41 85 06    test %eax,(%r14)
    // 41 85 07    test %eax,(%r15)
    //
    // 85 04 24    test %eax,(%rsp)
    // 41 85 04 24 test %eax,(%r12)
    // 85 45 00    test %eax,0x0(%rbp)
    // 41 85 45 00 test %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a java call. All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
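  // A hedged outline of the stub generated below (destination is one of the
  // SharedRuntime resolve/ic-miss entry points passed in by the caller):
  //
  //   save all live registers;
  //   rax = destination(thread);   // returns the code entry to jump to
  //   if (pending exception) forward it;
  //   rbx = resolved Method*;      // via get_vm_result_2
  //   restore registers, with the new rax/rbx patched into the save area;
  //   jmp rax;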
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map(__ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob; note that the frame size is in words, as RuntimeStub expects
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
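// (The doubling is what montgomery_square below relies on: in the expansion
// of a^2, each off-diagonal cross product a[j]*a[i-j] with j != i-j occurs
// twice, so a single MACC2 replaces a pair of MACCs.)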
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

#else //_WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                     \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                    \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
  c = _addcarry_u64(0, lo, T0, &T0);               \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
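// The saving comes from the symmetry of squaring: within column i of the
// product, a[j]*a[i-j] == a[i-j]*a[j], so each off-diagonal pair is computed
// once and added twice (MACC2), and only the diagonal term (present when i
// is even) is added once. A tiny worked example with len = 2, listing the
// columns of a^2:
//
//   col 0: a0*a0      (diagonal only)
//   col 1: 2*a0*a1    (one MACC2)
//   col 2: a1*a1      (diagonal only)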

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // would use a total of 8K bytes of stack space here.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof(julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // would use a total of 6K bytes of stack space here.
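  // The arithmetic behind the guarantee below, spelled out: with
  // divisor = sizeof(julong) * 3 = 24, longwords is capped at
  // 8192 / 24 = 341, so the three scratch arrays together occupy at
  // most 341 * 24 = 8184 bytes of stack.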
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof(julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// This code is jumped to from a compiled method
// (see emit_exception_handler in x86_64.ad file).
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  int pc_offset = 0;
  if (SCCache::load_exception_blob(&buffer, &pc_offset)) {
    OopMapSet* oop_maps = new OopMapSet();
    oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));

    // Set exception blob
    _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
    return;
  }

  MacroAssembler* masm = new MacroAssembler(&buffer);
  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers. See x86_64.ad.

  // rbp is an implicitly saved callee saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumptions
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
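  // A hedged sketch of the hand-off protocol used here (comment only;
  // accessor names are illustrative): the exception state travels through
  // the JavaThread rather than through argument registers:
  //
  //   thread->_exception_oop = rax;                      // stored just below
  //   thread->_exception_pc  = rdx;
  //   handler = OptoRuntime::handle_exception_C(thread);
  //   rax = thread->_exception_oop;                      // reloaded after the call
  //   rdx = thread->_exception_pc;
  //   goto handler;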
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work. It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site. This oopmap will only be used if we
  // are unwinding the stack. Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  pc_offset = the_pc - start;
  oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx); // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  SCCache::store_exception_blob(&buffer, pc_offset);
  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2