/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
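  // Note on the XSAVE_AREA_* constants below: push_FPU_state() stores an
  // FXSAVE/XSAVE image at the base of the save area, and these byte offsets
  // locate the individual state components inside that image (the legacy XMM
  // registers start at byte 160 of the FXSAVE image; the upper YMM halves,
  // APX extended GPRs, opmask registers and upper ZMM state follow at the
  // offsets assumed here, matching the XSAVE layout current processors
  // report via CPUID).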
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r31H_off,
    r30_off, r30H_off,
    r29_off, r29H_off,
    r28_off, r28H_off,
    r27_off, r27H_off,
    r26_off, r26H_off,
    r25_off, r25H_off,
    r24_off, r24H_off,
    r23_off, r23H_off,
    r22_off, r22H_off,
    r21_off, r21H_off,
    r20_off, r20H_off,
    r19_off, r19H_off,
    r18_off, r18H_off,
    r17_off, r17H_off,
    r16_off, r16H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints), not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
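  // A sketch of the resulting frame, from high to low addresses (this
  // mirrors the layout enum above):
  //   return address           (pushed by the caller)
  //   saved rbp                <- enter()
  //   saved flags              <- pushf()
  //   8-byte alignment filler  <- subq(rsp, 8)
  //   rax..r15                 <- save_legacy_gprs()
  //   FXSAVE/XSAVE image       <- push_FPU_state()
  //   arg register save area   (allocated below, if the platform uses one)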

  __ enter();     // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push_FPU_state() handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
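  // Each saved register is described to the OopMap by its compiler stack
  // slot (a jint index via VMRegImpl::stack2reg), so every 64-bit register
  // is covered by a pair of consecutive slots: the *_off and *H_off entries
  // from the layout enum above.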

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.
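  // (Integral and reference results arrive in rax, floating-point results
  // in xmm0; rdx is restored as well, presumably as the historical second
  // register of a long result pair.)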

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
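  // For example (illustrative only), a signature (int, long, Object, double)
  // maps as:
  //   int    -> j_rarg0
  //   long   -> j_rarg1 (its trailing T_VOID half is set_bad())
  //   Object -> j_rarg2
  //   double -> j_farg0 (its trailing T_VOID half is set_bad())
  // Only when the six integer or eight FP argument registers are exhausted
  // do arguments spill into stack slots, each spill aligned to a 2-slot
  // (8-byte) boundary.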
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing:
    // because we can fit a long/double in a single slot on a 64-bit VM, and
    // it would be silly to break them up, the interpreter leaves one slot
    // empty and only stores to a single slot. In this case the slot that is
    // occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry, else we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
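  // (For example, comp_args_on_stack == 3 slots, i.e. 12 bytes, rounds up
  // to comp_words_on_stack == 2 words.)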
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address and misalign the stack so that the youngest
  // frame sees the alignment it would have right after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs returning Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
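  // (The adapter blob published below exposes up to four entry points:
  // i2c_entry generated above, plus c2i_unverified_entry, c2i_entry and,
  // when fast class-init checks are supported, c2i_no_clinit_check_entry.)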

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);

  if (CheckJNICalls) {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // If the held monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
    __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // rax may hold an exception oop, save it before the call
    __ push(rax);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
    __ pop(rax);

    // For vthreads we have to explicitly zero the JNI monitor count of the carrier
    // on termination. The held count is implicitly zeroed below when we restore from
    // the parent held count (which has to be zero).
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#ifdef ASSERT
  else {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // See comment just above. If not checking JNI calls the JNI count is only
    // needed for assertion checking.
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#endif

  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
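  // (Because the Java calling convention is the "shifted" C ABI described
  // above java_calling_convention(), the first Java argument of the static
  // enterSpecial call arrives in c_rarg1; hence the register choices below.)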
1510 Register reg_cont_obj = c_rarg1;
1511 Register reg_is_cont = c_rarg2;
1512 Register reg_is_virtual = c_rarg3;
1513
1514 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
1515 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
1516 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1517
1518 // Utility methods kill rax; make sure there are no collisions
1519 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1520
1521 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1522 relocInfo::static_call_type);
1523
1524 address start = __ pc();
1525
1526 Label L_thaw, L_exit;
1527
1528 // i2i entry, used only in interp_only_mode
1529 interpreted_entry_offset = __ pc() - start;
1530 {
1531 #ifdef ASSERT
1532 Label is_interp_only;
1533 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1534 __ jcc(Assembler::notEqual, is_interp_only);
1535 __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1536 __ bind(is_interp_only);
1537 #endif
1538
1539 __ pop(rax); // return address
1540 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1541 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1542 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
1543 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
1544 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1545 __ push(rax); // return address
1546 __ push_cont_fastpath();
1547
1548 __ enter();
1549
1550 stack_slots = 2; // will be adjusted in setup
1551 OopMap* map = continuation_enter_setup(masm, stack_slots);
1552 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1553 // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1554
1555 __ verify_oop(reg_cont_obj);
1556
1557 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1558
1559 // If continuation, call to thaw. Otherwise, resolve the call and exit.
1560 __ testptr(reg_is_cont, reg_is_cont);
1561 __ jcc(Assembler::notZero, L_thaw);
1562
1563 // --- Resolve path
1564
1565 // Make sure the call is patchable
1566 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1567 // Emit stub for static call
1568 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1569 if (stub == nullptr) {
1570 fatal("CodeCache is full at gen_continuation_enter");
1571 }
1572 __ call(resolve);
1573 oop_maps->add_gc_map(__ pc() - start, map);
1574 __ post_call_nop();
1575
1576 __ jmp(L_exit);
1577 }
1578
1579 // compiled entry
1580 __ align(CodeEntryAlignment);
1581 compiled_entry_offset = __ pc() - start;
1582 __ enter();
1583
1584 stack_slots = 2; // will be adjusted in setup
1585 OopMap* map = continuation_enter_setup(masm, stack_slots);
1586
1587 // Frame is now completed as far as size and linkage.
1588 frame_complete = __ pc() - start;
1589
1590 __ verify_oop(reg_cont_obj);
1591
1592 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1593
1594 // If isContinue, call to thaw.
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1595 __ testptr(reg_is_cont, reg_is_cont); 1596 __ jccb(Assembler::notZero, L_thaw); 1597 1598 // --- call Continuation.enter(Continuation c, boolean isContinue) 1599 1600 // Make sure the call is patchable 1601 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1602 1603 // Emit stub for static call 1604 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1605 if (stub == nullptr) { 1606 fatal("CodeCache is full at gen_continuation_enter"); 1607 } 1608 1609 // The call needs to be resolved. There's a special case for this in 1610 // SharedRuntime::find_callee_info_helper() which calls 1611 // LinkResolver::resolve_continuation_enter() which resolves the call to 1612 // Continuation.enter(Continuation c, boolean isContinue). 1613 __ call(resolve); 1614 1615 oop_maps->add_gc_map(__ pc() - start, map); 1616 __ post_call_nop(); 1617 1618 __ jmpb(L_exit); 1619 1620 // --- Thawing path 1621 1622 __ bind(L_thaw); 1623 1624 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1625 1626 ContinuationEntry::_return_pc_offset = __ pc() - start; 1627 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1628 __ post_call_nop(); 1629 1630 // --- Normal exit (resolve/thawing) 1631 1632 __ bind(L_exit); 1633 1634 continuation_enter_cleanup(masm); 1635 __ pop(rbp); 1636 __ ret(0); 1637 1638 // --- Exception handling path 1639 1640 exception_offset = __ pc() - start; 1641 1642 continuation_enter_cleanup(masm); 1643 __ pop(rbp); 1644 1645 __ movptr(c_rarg0, r15_thread); 1646 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1647 1648 // rax still holds the original exception oop, save it before the call 1649 __ push(rax); 1650 1651 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1652 __ movptr(rbx, rax); 1653 1654 // Continue at exception handler: 1655 // rax: exception oop 1656 // rbx: exception handler 1657 // rdx: exception pc 1658 __ pop(rax); 1659 __ verify_oop(rax); 1660 __ pop(rdx); 1661 __ jmp(rbx); 1662 } 1663 1664 static void gen_continuation_yield(MacroAssembler* masm, 1665 const VMRegPair* regs, 1666 OopMapSet* oop_maps, 1667 int& frame_complete, 1668 int& stack_slots, 1669 int& compiled_entry_offset) { 1670 enum layout { 1671 rbp_off, 1672 rbpH_off, 1673 return_off, 1674 return_off2, 1675 framesize // inclusive of return address 1676 }; 1677 stack_slots = framesize / VMRegImpl::slots_per_word; 1678 assert(stack_slots == 2, "recheck layout"); 1679 1680 address start = __ pc(); 1681 compiled_entry_offset = __ pc() - start; 1682 __ enter(); 1683 address the_pc = __ pc(); 1684 1685 frame_complete = the_pc - start; 1686 1687 // This nop must be exactly at the PC we push into the frame info. 1688 // We use this nop for fast CodeBlob lookup, associate the OopMap 1689 // with it right away. 
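// (In other words: the_pc captured above is exactly what set_last_Java_frame
//  publishes below, and the nop guarantees a real instruction boundary at that
//  offset, so a stack walker can map the recorded pc back to this blob and to
//  the OopMap registered at frame_complete.)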
1690 __ post_call_nop(); 1691 OopMap* map = new OopMap(framesize, 1); 1692 oop_maps->add_gc_map(frame_complete, map); 1693 1694 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1695 __ movptr(c_rarg0, r15_thread); 1696 __ movptr(c_rarg1, rsp); 1697 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1698 __ reset_last_Java_frame(true); 1699 1700 Label L_pinned; 1701 1702 __ testptr(rax, rax); 1703 __ jcc(Assembler::notZero, L_pinned); 1704 1705 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1706 continuation_enter_cleanup(masm); 1707 __ pop(rbp); 1708 __ ret(0); 1709 1710 __ bind(L_pinned); 1711 1712 // Pinned, return to caller 1713 1714 // handle pending exception thrown by freeze 1715 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1716 Label ok; 1717 __ jcc(Assembler::equal, ok); 1718 __ leave(); 1719 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1720 __ bind(ok); 1721 1722 __ leave(); 1723 __ ret(0); 1724 } 1725 1726 static void gen_special_dispatch(MacroAssembler* masm, 1727 const methodHandle& method, 1728 const BasicType* sig_bt, 1729 const VMRegPair* regs) { 1730 verify_oop_args(masm, method, sig_bt, regs); 1731 vmIntrinsics::ID iid = method->intrinsic_id(); 1732 1733 // Now write the args into the outgoing interpreter space 1734 bool has_receiver = false; 1735 Register receiver_reg = noreg; 1736 int member_arg_pos = -1; 1737 Register member_reg = noreg; 1738 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1739 if (ref_kind != 0) { 1740 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1741 member_reg = rbx; // known to be free at this point 1742 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1743 } else if (iid == vmIntrinsics::_invokeBasic) { 1744 has_receiver = true; 1745 } else if (iid == vmIntrinsics::_linkToNative) { 1746 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1747 member_reg = rbx; // known to be free at this point 1748 } else { 1749 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1750 } 1751 1752 if (member_reg != noreg) { 1753 // Load the member_arg into register, if necessary. 1754 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1755 VMReg r = regs[member_arg_pos].first(); 1756 if (r->is_stack()) { 1757 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1758 } else { 1759 // no data motion is needed 1760 member_reg = r->as_Register(); 1761 } 1762 } 1763 1764 if (has_receiver) { 1765 // Make sure the receiver is loaded into a register. 1766 assert(method->size_of_parameters() > 0, "oob"); 1767 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1768 VMReg r = regs[0].first(); 1769 assert(r->is_valid(), "bad receiver arg"); 1770 if (r->is_stack()) { 1771 // Porting note: This assumes that compiled calling conventions always 1772 // pass the receiver oop in a register. If this is not true on some 1773 // platform, pick a temp and load the receiver from stack. 
1774 fatal("receiver always in a register"); 1775 receiver_reg = j_rarg0; // known to be free at this point 1776 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1777 } else { 1778 // no data motion is needed 1779 receiver_reg = r->as_Register(); 1780 } 1781 } 1782 1783 // Figure out which address we are really jumping to: 1784 MethodHandles::generate_method_handle_dispatch(masm, iid, 1785 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1786 } 1787 1788 // --------------------------------------------------------------------------- 1789 // Generate a native wrapper for a given method. The method takes arguments 1790 // in the Java compiled code convention, marshals them to the native 1791 // convention (handlizes oops, etc), transitions to native, makes the call, 1792 // returns to java state (possibly blocking), unhandlizes any result and 1793 // returns. 1794 // 1795 // Critical native functions are a shorthand for the use of 1796 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1797 // functions. The wrapper is expected to unpack the arguments before 1798 // passing them to the callee. Critical native functions leave the state _in_Java, 1799 // since they cannot stop for GC. 1800 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1801 // block and the check for pending exceptions it's impossible for them 1802 // to be thrown. 1803 // 1804 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1805 const methodHandle& method, 1806 int compile_id, 1807 BasicType* in_sig_bt, 1808 VMRegPair* in_regs, 1809 BasicType ret_type) { 1810 if (method->is_continuation_native_intrinsic()) { 1811 int exception_offset = -1; 1812 OopMapSet* oop_maps = new OopMapSet(); 1813 int frame_complete = -1; 1814 int stack_slots = -1; 1815 int interpreted_entry_offset = -1; 1816 int vep_offset = -1; 1817 if (method->is_continuation_enter_intrinsic()) { 1818 gen_continuation_enter(masm, 1819 in_regs, 1820 exception_offset, 1821 oop_maps, 1822 frame_complete, 1823 stack_slots, 1824 interpreted_entry_offset, 1825 vep_offset); 1826 } else if (method->is_continuation_yield_intrinsic()) { 1827 gen_continuation_yield(masm, 1828 in_regs, 1829 oop_maps, 1830 frame_complete, 1831 stack_slots, 1832 vep_offset); 1833 } else { 1834 guarantee(false, "Unknown Continuation native intrinsic"); 1835 } 1836 1837 #ifdef ASSERT 1838 if (method->is_continuation_enter_intrinsic()) { 1839 assert(interpreted_entry_offset != -1, "Must be set"); 1840 assert(exception_offset != -1, "Must be set"); 1841 } else { 1842 assert(interpreted_entry_offset == -1, "Must be unset"); 1843 assert(exception_offset == -1, "Must be unset"); 1844 } 1845 assert(frame_complete != -1, "Must be set"); 1846 assert(stack_slots != -1, "Must be set"); 1847 assert(vep_offset != -1, "Must be set"); 1848 #endif 1849 1850 __ flush(); 1851 nmethod* nm = nmethod::new_native_nmethod(method, 1852 compile_id, 1853 masm->code(), 1854 vep_offset, 1855 frame_complete, 1856 stack_slots, 1857 in_ByteSize(-1), 1858 in_ByteSize(-1), 1859 oop_maps, 1860 exception_offset); 1861 if (nm == nullptr) return nm; 1862 if (method->is_continuation_enter_intrinsic()) { 1863 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1864 } else if (method->is_continuation_yield_intrinsic()) { 1865 _cont_doYield_stub = nm; 1866 } 1867 return nm; 1868 } 1869 1870 if (method->is_method_handle_intrinsic()) { 1871 vmIntrinsics::ID iid = method->intrinsic_id(); 1872 intptr_t 
1872 intptr_t start = (intptr_t)__ pc();
1873 int vep_offset = ((intptr_t)__ pc()) - start;
1874 gen_special_dispatch(masm,
1875 method,
1876 in_sig_bt,
1877 in_regs);
1878 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1879 __ flush();
1880 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1881 return nmethod::new_native_nmethod(method,
1882 compile_id,
1883 masm->code(),
1884 vep_offset,
1885 frame_complete,
1886 stack_slots / VMRegImpl::slots_per_word,
1887 in_ByteSize(-1),
1888 in_ByteSize(-1),
1889 nullptr);
1890 }
1891 address native_func = method->native_function();
1892 assert(native_func != nullptr, "must have function");
1893
1894 // An OopMap for lock (and class if static)
1895 OopMapSet *oop_maps = new OopMapSet();
1896 intptr_t start = (intptr_t)__ pc();
1897
1898 // We have received a description of where all the java args are located
1899 // on entry to the wrapper. We need to convert these args to where
1900 // the jni function will expect them. To figure out where they go
1901 // we convert the java signature to a C signature by inserting
1902 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1903
1904 const int total_in_args = method->size_of_parameters();
1905 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1906
1907 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1908 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1909 BasicType* in_elem_bt = nullptr;
1910
1911 int argc = 0;
1912 out_sig_bt[argc++] = T_ADDRESS;
1913 if (method->is_static()) {
1914 out_sig_bt[argc++] = T_OBJECT;
1915 }
1916
1917 for (int i = 0; i < total_in_args ; i++ ) {
1918 out_sig_bt[argc++] = in_sig_bt[i];
1919 }
1920
1921 // Now figure out where the args must be stored and how much stack space
1922 // they require.
1923 int out_arg_slots;
1924 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1925
1926 // Compute framesize for the wrapper. We need to handlize all oops in
1927 // incoming registers
1928
1929 // Calculate the total number of stack slots we will need.
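// (A worked sketch of the budget computed below, assuming a static
//  synchronized method whose out-args need 8 slots: 0 preserved + 8 out-args
//  + 12 oop-handle slots (6 registers * 2 slots) + 2 klass + 2 lock
//  + 6 scratch = 30 slots, aligned up to 32 slots, i.e. a 128-byte body.)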
1930
1931 // First count the abi requirement plus all of the outgoing args
1932 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1933
1934 // Now the space for the inbound oop handle area
1935 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1936
1937 int oop_handle_offset = stack_slots;
1938 stack_slots += total_save_slots;
1939
1940 // Now any space we need for handlizing a klass if static method
1941
1942 int klass_slot_offset = 0;
1943 int klass_offset = -1;
1944 int lock_slot_offset = 0;
1945 bool is_static = false;
1946
1947 if (method->is_static()) {
1948 klass_slot_offset = stack_slots;
1949 stack_slots += VMRegImpl::slots_per_word;
1950 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1951 is_static = true;
1952 }
1953
1954 // Plus a lock if needed
1955
1956 if (method->is_synchronized()) {
1957 lock_slot_offset = stack_slots;
1958 stack_slots += VMRegImpl::slots_per_word;
1959 }
1960
1961 // Now a place (+2) to save return values or temp during shuffling
1962 // + 4 for return address (which we own) and saved rbp
1963 stack_slots += 6;
1964
1965 // OK, the space we have allocated will look like:
1966 //
1967 //
1968 // FP-> |                     |
1969 //      |---------------------|
1970 //      | 2 slots for moves   |
1971 //      |---------------------|
1972 //      | lock box (if sync)  |
1973 //      |---------------------| <- lock_slot_offset
1974 //      | klass (if static)   |
1975 //      |---------------------| <- klass_slot_offset
1976 //      | oopHandle area      |
1977 //      |---------------------| <- oop_handle_offset (6 java arg registers)
1978 //      | outbound memory     |
1979 //      | based arguments     |
1980 //      |                     |
1981 //      |---------------------|
1982 //      |                     |
1983 // SP-> | out_preserved_slots |
1984 //
1985 //
1986
1987
1988 // Now compute the actual number of stack words we need, rounding to keep the
1989 // stack properly aligned.
1990 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1991
1992 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1993
1994 // First thing make an ic check to see if we should even be here
1995
1996 // We are free to use all registers as temps, without saving and
1997 // restoring them, except for rbp. rbp is the only callee save register
1998 // as far as the interpreter and the compiler(s) are concerned.
1999
2000 const Register receiver = j_rarg0;
2001
2002 Label exception_pending;
2003
2004 assert_different_registers(receiver, rscratch1, rscratch2);
2005 __ verify_oop(receiver);
2006 __ ic_check(8 /* end_alignment */);
2007
2008 int vep_offset = ((intptr_t)__ pc()) - start;
2009
2010 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2011 Label L_skip_barrier;
2012 Register klass = r10;
2013 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2014 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2015
2016 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2017
2018 __ bind(L_skip_barrier);
2019 }
2020
2021 #ifdef COMPILER1
2022 // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
2023 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2024 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2025 }
2026 #endif // COMPILER1
2027
2028 // The instruction at the verified entry point must be 5 bytes or longer
2029 // because it can be patched on the fly by make_non_entrant. The stack bang
2030 // instruction fits that requirement.
2031
2032 // Generate stack overflow check
2033 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2034
2035 // Generate a new frame for the wrapper.
2036 __ enter();
2037 // -2 because return address is already present and so is saved rbp
2038 __ subptr(rsp, stack_size - 2*wordSize);
2039
2040 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2041 // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
2042 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2043
2044 // Frame is now completed as far as size and linkage.
2045 int frame_complete = ((intptr_t)__ pc()) - start;
2046
2047 #ifdef ASSERT
2048 __ check_stack_alignment(rsp, "improperly aligned stack");
2049 #endif /* ASSERT */
2050
2051
2052 // We use r14 as the oop handle for the receiver/klass
2053 // It is callee save so it survives the call to native
2054
2055 const Register oop_handle_reg = r14;
2056
2057 //
2058 // We immediately shuffle the arguments so that for any vm call we have to
2059 // make from here on out (sync slow path, jvmti, etc.) we will have
2060 // captured the oops from our caller and have a valid oopMap for
2061 // them.
2062
2063 // -----------------
2064 // The Grand Shuffle
2065
2066 // The Java calling convention is either equal (linux) or denser (win64) than the
2067 // C calling convention. However, because of the jni_env argument, the C calling
2068 // convention always has at least one more (and two for static) arguments than Java.
2069 // Therefore if we move the args from java -> c backwards then we will never have
2070 // a register->register conflict and we don't have to build a dependency graph
2071 // and figure out how to break any cycles.
2072 //
2073
2074 // Record esp-based slot for receiver on stack for non-static methods
2075 int receiver_offset = -1;
2076
2077 // This is a trick. We double the stack slots so we can claim
2078 // the oops in the caller's frame. Since we are sure to have
2079 // more args than the caller, doubling is enough to make
2080 // sure we can capture all the incoming oop args from the
2081 // caller.
2082 //
2083 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2084
2085 // Mark location of rbp (someday)
2086 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2087
2088 // Use eax, ebx as temporaries during any memory-memory moves we have to do
2089 // All inbound args are referenced based on rbp and all outbound args via rsp.
2090
2091
2092 #ifdef ASSERT
2093 bool reg_destroyed[Register::number_of_registers];
2094 bool freg_destroyed[XMMRegister::number_of_registers];
2095 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2096 reg_destroyed[r] = false;
2097 }
2098 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2099 freg_destroyed[f] = false;
2100 }
2101
2102 #endif /* ASSERT */
2103
2104 // For JNI natives the incoming and outgoing registers are offset upwards.
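// (Why moving backwards is safe, roughly sketched: Java arg i lands at C arg
//  i+1, or i+2 for static methods, so every move's destination sits "later"
//  in the outgoing sequence than its source; shuffling from the last arg to
//  the first therefore never overwrites a source that is still to be read.)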
2105 GrowableArray<int> arg_order(2 * total_in_args);
2106
2107 VMRegPair tmp_vmreg;
2108 tmp_vmreg.set2(rbx->as_VMReg());
2109
2110 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2111 arg_order.push(i);
2112 arg_order.push(c_arg);
2113 }
2114
2115 int temploc = -1;
2116 for (int ai = 0; ai < arg_order.length(); ai += 2) {
2117 int i = arg_order.at(ai);
2118 int c_arg = arg_order.at(ai + 1);
2119 __ block_comment(err_msg("move %d -> %d", i, c_arg));
2120 #ifdef ASSERT
2121 if (in_regs[i].first()->is_Register()) {
2122 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2123 } else if (in_regs[i].first()->is_XMMRegister()) {
2124 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2125 }
2126 if (out_regs[c_arg].first()->is_Register()) {
2127 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2128 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2129 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2130 }
2131 #endif /* ASSERT */
2132 switch (in_sig_bt[i]) {
2133 case T_ARRAY:
2134 case T_OBJECT:
2135 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2136 ((i == 0) && (!is_static)),
2137 &receiver_offset);
2138 break;
2139 case T_VOID:
2140 break;
2141
2142 case T_FLOAT:
2143 __ float_move(in_regs[i], out_regs[c_arg]);
2144 break;
2145
2146 case T_DOUBLE:
2147 assert( i + 1 < total_in_args &&
2148 in_sig_bt[i + 1] == T_VOID &&
2149 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2150 __ double_move(in_regs[i], out_regs[c_arg]);
2151 break;
2152
2153 case T_LONG :
2154 __ long_move(in_regs[i], out_regs[c_arg]);
2155 break;
2156
2157 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2158
2159 default:
2160 __ move32_64(in_regs[i], out_regs[c_arg]);
2161 }
2162 }
2163
2164 int c_arg;
2165
2166 // Pre-load a static method's oop into r14. Used both by locking code and
2167 // the normal JNI call code.
2168 // point c_arg at the first arg that is already loaded in case we
2169 // need to spill before we call out
2170 c_arg = total_c_args - total_in_args;
2171
2172 if (method->is_static()) {
2173
2174 // load oop into a register
2175 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2176
2177 // Now handlize the static class mirror; it's known not-null.
2178 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2179 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2180
2181 // Now get the handle
2182 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2183 // store the klass handle as second argument
2184 __ movptr(c_rarg1, oop_handle_reg);
2185 // and protect the arg if we must spill
2186 c_arg--;
2187 }
2188
2189 // Change state to native (we save the return address in the thread, since it might not
2190 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2191 // points into the right code segment. It does not have to be the correct return pc.
2192 // We use the same pc/oopMap repeatedly when we call out
2193
2194 intptr_t the_pc = (intptr_t) __ pc();
2195 oop_maps->add_gc_map(the_pc - start, map);
2196
2197 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2198
2199
2200 // We have all of the arguments set up at this point. We must not touch any of the
2201 // argument registers from here on (if we had to save/restore them, there would be no oopMap covering them).
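// (Recap of what the shuffle above produced for oop args, as a sketch: each
//  incoming oop was spilled into the oop-handle area and the outgoing C
//  argument is the *address* of that slot, or null for a null oop:
//    rsp[oop_handle_offset + k] = oop;
//    c_arg = (oop != nullptr) ? &rsp[oop_handle_offset + k] : nullptr;
//  with the OopMap recording each spill slot so GC can update it.)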
2202
2203 if (DTraceMethodProbes) {
2204 // protect the args we've loaded
2205 save_args(masm, total_c_args, c_arg, out_regs);
2206 __ mov_metadata(c_rarg1, method());
2207 __ call_VM_leaf(
2208 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2209 r15_thread, c_rarg1);
2210 restore_args(masm, total_c_args, c_arg, out_regs);
2211 }
2212
2213 // RedefineClasses() tracing support for obsolete method entry
2214 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2215 // protect the args we've loaded
2216 save_args(masm, total_c_args, c_arg, out_regs);
2217 __ mov_metadata(c_rarg1, method());
2218 __ call_VM_leaf(
2219 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2220 r15_thread, c_rarg1);
2221 restore_args(masm, total_c_args, c_arg, out_regs);
2222 }
2223
2224 // Lock a synchronized method
2225
2226 // Register definitions used by locking and unlocking
2227
2228 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2229 const Register obj_reg = rbx; // Will contain the oop
2230 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2231 const Register old_hdr = r13; // value of old header at unlock time
2232
2233 Label slow_path_lock;
2234 Label lock_done;
2235
2236 if (method->is_synchronized()) {
2237 Label count_mon;
2238
2239 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2240
2241 // Get the handle (the 2nd argument)
2242 __ mov(oop_handle_reg, c_rarg1);
2243
2244 // Get address of the box
2245
2246 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2247
2248 // Load the oop from the handle
2249 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2250
2251 if (LockingMode == LM_MONITOR) {
2252 __ jmp(slow_path_lock);
2253 } else if (LockingMode == LM_LEGACY) {
2254 // Load immediate 1 into swap_reg %rax
2255 __ movl(swap_reg, 1);
2256
2257 // Load (object->mark() | 1) into swap_reg %rax
2258 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2259
2260 // Save (object->mark() | 1) into BasicLock's displaced header
2261 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2262
2263 // src -> dest iff dest == rax else rax <- dest
2264 __ lock();
2265 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2266 __ jcc(Assembler::equal, count_mon);
2267
2268 // Hmm should this move to the slow path code area???
2269
2270 // Test if the oopMark is an obvious stack pointer, i.e.,
2271 //  1) (mark & 3) == 0, and
2272 //  2) rsp <= mark < rsp + os::pagesize()
2273 // These 3 tests can be done by evaluating the following
2274 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2275 // assuming both stack pointer and pagesize have their
2276 // least significant 2 bits clear.
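// (Concretely, assuming a 4096-byte page: 3 - 4096 == 0x...fffff003, so the
//  and-mask keeps bits 0-1 and bits 12 and up. The result is zero iff
//  0 <= mark - rsp < 4096 with the low two bits clear, i.e. the displaced
//  mark is a 4-byte-aligned address within our own stack page - the
//  recursive-lock case.)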
2277 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2278
2279 __ subptr(swap_reg, rsp);
2280 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2281
2282 // Save the test result; for the recursive case, the result is zero
2283 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2284 __ jcc(Assembler::notEqual, slow_path_lock);
2285 } else {
2286 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2287 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2288 }
2289 __ bind(count_mon);
2290 __ inc_held_monitor_count();
2291
2292 // Slow path will re-enter here
2293 __ bind(lock_done);
2294 }
2295
2296 // Finally just about ready to make the JNI call
2297
2298 // get JNIEnv* which is first argument to native
2299 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2300
2301 // Now set thread in native
2302 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2303
2304 __ call(RuntimeAddress(native_func));
2305
2306 // Verify or restore cpu control state after JNI call
2307 __ restore_cpu_control_state_after_jni(rscratch1);
2308
2309 // Unpack native results.
2310 switch (ret_type) {
2311 case T_BOOLEAN: __ c2bool(rax); break;
2312 case T_CHAR : __ movzwl(rax, rax); break;
2313 case T_BYTE : __ sign_extend_byte (rax); break;
2314 case T_SHORT : __ sign_extend_short(rax); break;
2315 case T_INT : /* nothing to do */ break;
2316 case T_DOUBLE :
2317 case T_FLOAT :
2318 // Result is in xmm0 we'll save as needed
2319 break;
2320 case T_ARRAY: // Really a handle
2321 case T_OBJECT: // Really a handle
2322 break; // can't de-handlize until after safepoint check
2323 case T_VOID: break;
2324 case T_LONG: break;
2325 default : ShouldNotReachHere();
2326 }
2327
2328 Label after_transition;
2329
2330 // Switch thread to "native transition" state before reading the synchronization state.
2331 // This additional state is necessary because reading and testing the synchronization
2332 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2333 //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2334 //     VM thread changes sync state to synchronizing and suspends threads for GC.
2335 //     Thread A is resumed to finish this native method, but doesn't block here since it
2336 //     didn't see any synchronization in progress, and escapes.
2337 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2338
2339 // Force this write out before the read below
2340 if (!UseSystemMemoryBarrier) {
2341 __ membar(Assembler::Membar_mask_bits(
2342 Assembler::LoadLoad | Assembler::LoadStore |
2343 Assembler::StoreLoad | Assembler::StoreStore));
2344 }
2345
2346 // check for safepoint operation in progress and/or pending suspend requests
2347 {
2348 Label Continue;
2349 Label slow_path;
2350
2351 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2352
2353 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2354 __ jcc(Assembler::equal, Continue);
2355 __ bind(slow_path);
2356
2357 // Don't use call_VM as it will see a possible pending exception and forward it,
2358 // never returning here, which would prevent us from clearing _last_native_pc down below.
2359 // We can't use call_VM_leaf either, as it will check that rsi & rdi are
2360 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2361 // by hand.
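// (I.e., roughly: save any native result, reserve the Win64 register-arg
//  area, align rsp to 16, call
//  check_special_condition_for_native_trans(thread) directly, then restore
//  rsp and the result - skipping call_VM's exception and bcp bookkeeping.)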
2362 // 2363 __ vzeroupper(); 2364 save_native_result(masm, ret_type, stack_slots); 2365 __ mov(c_rarg0, r15_thread); 2366 __ mov(r12, rsp); // remember sp 2367 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2368 __ andptr(rsp, -16); // align stack as required by ABI 2369 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2370 __ mov(rsp, r12); // restore sp 2371 __ reinit_heapbase(); 2372 // Restore any method result value 2373 restore_native_result(masm, ret_type, stack_slots); 2374 __ bind(Continue); 2375 } 2376 2377 // change thread state 2378 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2379 __ bind(after_transition); 2380 2381 Label reguard; 2382 Label reguard_done; 2383 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2384 __ jcc(Assembler::equal, reguard); 2385 __ bind(reguard_done); 2386 2387 // native result if any is live 2388 2389 // Unlock 2390 Label slow_path_unlock; 2391 Label unlock_done; 2392 if (method->is_synchronized()) { 2393 2394 Label fast_done; 2395 2396 // Get locked oop from the handle we passed to jni 2397 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2398 2399 if (LockingMode == LM_LEGACY) { 2400 Label not_recur; 2401 // Simple recursive lock? 2402 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2403 __ jcc(Assembler::notEqual, not_recur); 2404 __ dec_held_monitor_count(); 2405 __ jmpb(fast_done); 2406 __ bind(not_recur); 2407 } 2408 2409 // Must save rax if it is live now because cmpxchg must use it 2410 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2411 save_native_result(masm, ret_type, stack_slots); 2412 } 2413 2414 if (LockingMode == LM_MONITOR) { 2415 __ jmp(slow_path_unlock); 2416 } else if (LockingMode == LM_LEGACY) { 2417 // get address of the stack lock 2418 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2419 // get old displaced header 2420 __ movptr(old_hdr, Address(rax, 0)); 2421 2422 // Atomic swap old header if oop still contains the stack lock 2423 __ lock(); 2424 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2425 __ jcc(Assembler::notEqual, slow_path_unlock); 2426 __ dec_held_monitor_count(); 2427 } else { 2428 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2429 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2430 __ dec_held_monitor_count(); 2431 } 2432 2433 // slow path re-enters here 2434 __ bind(unlock_done); 2435 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2436 restore_native_result(masm, ret_type, stack_slots); 2437 } 2438 2439 __ bind(fast_done); 2440 } 2441 if (DTraceMethodProbes) { 2442 save_native_result(masm, ret_type, stack_slots); 2443 __ mov_metadata(c_rarg1, method()); 2444 __ call_VM_leaf( 2445 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2446 r15_thread, c_rarg1); 2447 restore_native_result(masm, ret_type, stack_slots); 2448 } 2449 2450 __ reset_last_Java_frame(false); 2451 2452 // Unbox oop result, e.g. JNIHandles::resolve value. 
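// (Unboxing, as a sketch: a JNI local handle is the address of a slot holding
//  the oop, so resolve_jobject is essentially
//    rax = (rax == nullptr) ? nullptr : *(oop*)rax;
//  plus a tag check so weak global handles go through the proper GC barrier.)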
2453 if (is_reference_type(ret_type)) { 2454 __ resolve_jobject(rax /* value */, 2455 r15_thread /* thread */, 2456 rcx /* tmp */); 2457 } 2458 2459 if (CheckJNICalls) { 2460 // clear_pending_jni_exception_check 2461 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2462 } 2463 2464 // reset handle block 2465 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2466 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2467 2468 // pop our frame 2469 2470 __ leave(); 2471 2472 // Any exception pending? 2473 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2474 __ jcc(Assembler::notEqual, exception_pending); 2475 2476 // Return 2477 2478 __ ret(0); 2479 2480 // Unexpected paths are out of line and go here 2481 2482 // forward the exception 2483 __ bind(exception_pending); 2484 2485 // and forward the exception 2486 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2487 2488 // Slow path locking & unlocking 2489 if (method->is_synchronized()) { 2490 2491 // BEGIN Slow path lock 2492 __ bind(slow_path_lock); 2493 2494 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2495 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2496 2497 // protect the args we've loaded 2498 save_args(masm, total_c_args, c_arg, out_regs); 2499 2500 __ mov(c_rarg0, obj_reg); 2501 __ mov(c_rarg1, lock_reg); 2502 __ mov(c_rarg2, r15_thread); 2503 2504 // Not a leaf but we have last_Java_frame setup as we want 2505 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2506 restore_args(masm, total_c_args, c_arg, out_regs); 2507 2508 #ifdef ASSERT 2509 { Label L; 2510 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2511 __ jcc(Assembler::equal, L); 2512 __ stop("no pending exception allowed on exit from monitorenter"); 2513 __ bind(L); 2514 } 2515 #endif 2516 __ jmp(lock_done); 2517 2518 // END Slow path lock 2519 2520 // BEGIN Slow path unlock 2521 __ bind(slow_path_unlock); 2522 2523 // If we haven't already saved the native result we must save it now as xmm registers 2524 // are still exposed. 
2525 __ vzeroupper();
2526 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2527 save_native_result(masm, ret_type, stack_slots);
2528 }
2529
2530 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2531
2532 __ mov(c_rarg0, obj_reg);
2533 __ mov(c_rarg2, r15_thread);
2534 __ mov(r12, rsp); // remember sp
2535 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2536 __ andptr(rsp, -16); // align stack as required by ABI
2537
2538 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2539 // NOTE that obj_reg == rbx currently
2540 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2541 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2542
2543 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2544 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2545 __ mov(rsp, r12); // restore sp
2546 __ reinit_heapbase();
2547 #ifdef ASSERT
2548 {
2549 Label L;
2550 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2551 __ jcc(Assembler::equal, L);
2552 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2553 __ bind(L);
2554 }
2555 #endif /* ASSERT */
2556
2557 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2558
2559 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2560 restore_native_result(masm, ret_type, stack_slots);
2561 }
2562 __ jmp(unlock_done);
2563
2564 // END Slow path unlock
2565
2566 } // synchronized
2567
2568 // SLOW PATH Reguard the stack if needed
2569
2570 __ bind(reguard);
2571 __ vzeroupper();
2572 save_native_result(masm, ret_type, stack_slots);
2573 __ mov(r12, rsp); // remember sp
2574 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2575 __ andptr(rsp, -16); // align stack as required by ABI
2576 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2577 __ mov(rsp, r12); // restore sp
2578 __ reinit_heapbase();
2579 restore_native_result(masm, ret_type, stack_slots);
2580 // and continue
2581 __ jmp(reguard_done);
2582
2583
2584
2585 __ flush();
2586
2587 nmethod *nm = nmethod::new_native_nmethod(method,
2588 compile_id,
2589 masm->code(),
2590 vep_offset,
2591 frame_complete,
2592 stack_slots / VMRegImpl::slots_per_word,
2593 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2594 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2595 oop_maps);
2596
2597 return nm;
2598 }
2599
2600 // this function returns the adjustment (in number of words) to a c2i adapter
2601 // activation for use during deoptimization
2602 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2603 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2604 }
2605
2606
2607 uint SharedRuntime::out_preserve_stack_slots() {
2608 return 0;
2609 }
2610
2611
2612 // Number of stack slots between incoming argument block and the start of
2613 // a new frame. The PROLOG must add this many slots to the stack. The
2614 // EPILOG must remove this many slots. amd64 needs two slots for
2615 // return address.
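// (That is: 2 slots for the return address plus 2 for the saved rbp = 4;
//  the 2 * VerifyStackAtCalls term presumably reserves one extra word for
//  the stack canary that verified frames push under that develop flag.)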
2616 uint SharedRuntime::in_preserve_stack_slots() { 2617 return 4 + 2 * VerifyStackAtCalls; 2618 } 2619 2620 //------------------------------generate_deopt_blob---------------------------- 2621 void SharedRuntime::generate_deopt_blob() { 2622 // Allocate space for the code 2623 ResourceMark rm; 2624 // Setup code generation tools 2625 int pad = 0; 2626 if (UseAVX > 2) { 2627 pad += 1024; 2628 } 2629 if (UseAPX) { 2630 pad += 1024; 2631 } 2632 #if INCLUDE_JVMCI 2633 if (EnableJVMCI) { 2634 pad += 512; // Increase the buffer size when compiling for JVMCI 2635 } 2636 #endif 2637 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2638 MacroAssembler* masm = new MacroAssembler(&buffer); 2639 int frame_size_in_words; 2640 OopMap* map = nullptr; 2641 OopMapSet *oop_maps = new OopMapSet(); 2642 2643 // ------------- 2644 // This code enters when returning to a de-optimized nmethod. A return 2645 // address has been pushed on the stack, and return values are in 2646 // registers. 2647 // If we are doing a normal deopt then we were called from the patched 2648 // nmethod from the point we returned to the nmethod. So the return 2649 // address on the stack is wrong by NativeCall::instruction_size 2650 // We will adjust the value so it looks like we have the original return 2651 // address on the stack (like when we eagerly deoptimized). 2652 // In the case of an exception pending when deoptimizing, we enter 2653 // with a return address on the stack that points after the call we patched 2654 // into the exception handler. We have the following register state from, 2655 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2656 // rax: exception oop 2657 // rbx: exception handler 2658 // rdx: throwing pc 2659 // So in this case we simply jam rdx into the useless return address and 2660 // the stack looks just like we want. 2661 // 2662 // At this point we need to de-opt. We save the argument return 2663 // registers. We call the first C routine, fetch_unroll_info(). This 2664 // routine captures the return values and returns a structure which 2665 // describes the current frame size and the sizes of all replacement frames. 2666 // The current frame is compiled code and may contain many inlined 2667 // functions, each with their own JVM state. We pop the current frame, then 2668 // push all the new frames. Then we call the C routine unpack_frames() to 2669 // populate these frames. Finally unpack_frames() returns us the new target 2670 // address. Notice that callee-save registers are BLOWN here; they have 2671 // already been captured in the vframeArray at the time the return PC was 2672 // patched. 2673 address start = __ pc(); 2674 Label cont; 2675 2676 // Prolog for non exception case! 2677 2678 // Save everything in sight. 2679 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2680 2681 // Normal deoptimization. Save exec mode for unpack_frames. 
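// (For orientation, the exec modes stashed in r14 below: Unpack_deopt for a
//  regular deopt, Unpack_exception when deopting with a pending exception,
//  Unpack_reexecute when the bytecode must be re-executed, and
//  Unpack_uncommon_trap for C2 uncommon traps.)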
2682 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2683 __ jmp(cont);
2684
2685 int reexecute_offset = __ pc() - start;
2686 #if INCLUDE_JVMCI && !defined(COMPILER1)
2687 if (EnableJVMCI && UseJVMCICompiler) {
2688 // JVMCI does not use this kind of deoptimization
2689 __ should_not_reach_here();
2690 }
2691 #endif
2692
2693 // Reexecute case
2694 // the return address is the pc that describes what bci to re-execute at
2695
2696 // No need to update map as each call to save_live_registers will produce identical oopmap
2697 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2698
2699 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2700 __ jmp(cont);
2701
2702 #if INCLUDE_JVMCI
2703 Label after_fetch_unroll_info_call;
2704 int implicit_exception_uncommon_trap_offset = 0;
2705 int uncommon_trap_offset = 0;
2706
2707 if (EnableJVMCI) {
2708 implicit_exception_uncommon_trap_offset = __ pc() - start;
2709
2710 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2711 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2712
2713 uncommon_trap_offset = __ pc() - start;
2714
2715 // Save everything in sight.
2716 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2717 // fetch_unroll_info needs to call last_java_frame()
2718 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2719
2720 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2721 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2722
2723 __ movl(r14, Deoptimization::Unpack_reexecute);
2724 __ mov(c_rarg0, r15_thread);
2725 __ movl(c_rarg2, r14); // exec mode
2726 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2727 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2728
2729 __ reset_last_Java_frame(false);
2730
2731 __ jmp(after_fetch_unroll_info_call);
2732 } // EnableJVMCI
2733 #endif // INCLUDE_JVMCI
2734
2735 int exception_offset = __ pc() - start;
2736
2737 // Prolog for exception case
2738
2739 // all registers are dead at this entry point, except for rax and
2740 // rdx, which contain the exception oop and exception pc
2741 // respectively. Set them in TLS and fall thru to the
2742 // unpack_with_exception_in_tls entry point.
2743
2744 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2745 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2746
2747 int exception_in_tls_offset = __ pc() - start;
2748
2749 // new implementation because exception oop is now passed in JavaThread
2750
2751 // Prolog for exception case
2752 // All registers must be preserved because they might be used by LinearScan
2753 // Exception oop and throwing PC are passed in JavaThread
2754 // tos: stack at point of call to method that threw the exception (i.e. only
2755 // args are on the stack, no return address)
2756
2757 // make room on stack for the return address
2758 // It will be patched later with the throwing pc. The correct value is not
2759 // available now because loading it from memory would destroy registers.
2760 __ push(0);
2761
2762 // Save everything in sight.
2763 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2764
2765 // Now it is safe to overwrite any register
2766
2767 // Deopt during an exception. Save exec mode for unpack_frames.
2768 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2769
2770 // load throwing pc from JavaThread and patch it as the return address
2771 // of the current frame. Then clear the field in JavaThread
2772
2773 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2774 __ movptr(Address(rbp, wordSize), rdx);
2775 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2776
2777 #ifdef ASSERT
2778 // verify that there is really an exception oop in JavaThread
2779 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2780 __ verify_oop(rax);
2781
2782 // verify that there is no pending exception
2783 Label no_pending_exception;
2784 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2785 __ testptr(rax, rax);
2786 __ jcc(Assembler::zero, no_pending_exception);
2787 __ stop("must not have pending exception here");
2788 __ bind(no_pending_exception);
2789 #endif
2790
2791 __ bind(cont);
2792
2793 // Call C code. Need thread and this frame, but NOT official VM entry
2794 // crud. We cannot block on this call, no GC can happen.
2795 //
2796 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2797
2798 // fetch_unroll_info needs to call last_java_frame().
2799
2800 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2801 #ifdef ASSERT
2802 { Label L;
2803 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2804 __ jcc(Assembler::equal, L);
2805 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2806 __ bind(L);
2807 }
2808 #endif // ASSERT
2809 __ mov(c_rarg0, r15_thread);
2810 __ movl(c_rarg1, r14); // exec_mode
2811 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2812
2813 // Need to have an oopmap that tells fetch_unroll_info where to
2814 // find any register it might need.
2815 oop_maps->add_gc_map(__ pc() - start, map);
2816
2817 __ reset_last_Java_frame(false);
2818
2819 #if INCLUDE_JVMCI
2820 if (EnableJVMCI) {
2821 __ bind(after_fetch_unroll_info_call);
2822 }
2823 #endif
2824
2825 // Load UnrollBlock* into rdi
2826 __ mov(rdi, rax);
2827
2828 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2829 Label noException;
2830 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2831 __ jcc(Assembler::notEqual, noException);
2832 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2833 // QQQ this is useless, it was null above
2834 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2835 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2836 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2837
2838 __ verify_oop(rax);
2839
2840 // Overwrite the result registers with the exception results.
2841 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2842 // I think this is useless
2843 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2844
2845 __ bind(noException);
2846
2847 // Only register save data is on the stack.
2848 // Now restore the result registers. Everything else is either dead
2849 // or captured in the vframeArray.
2850 RegisterSaver::restore_result_registers(masm);
2851
2852 // All of the register save area has been popped off the stack. Only the
2853 // return address remains.
2854
2855 // Pop all the frames we must move/replace.
2856 //
2857 // Frame picture (youngest to oldest)
2858 // 1: self-frame (no frame link)
2859 // 2: deopting frame (no frame link)
2860 // 3: caller of deopting frame (could be compiled/interpreted).
2861 //
2862 // Note: by leaving the return address of the self-frame on the stack
2863 // and using the size of frame 2 to adjust the stack,
2864 // the return to frame 3 will still be on the stack when we are done.
2865
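// (The replacement sequence below, as a rough sketch in pseudo-C:
//    rsp += size_of_deoptimized_frame;    // pop frame 2 (incl. its stale pc)
//    rsp -= caller_adjustment;            // grow the caller for extra locals
//    for (k = number_of_frames; k > 0; k--) {
//      push(*pcs++);                      // return address
//      push(rbp); rbp = rsp;              // enter()
//      rsp -= *sizes++ - 2*wordSize;      // skeletal frame body
//    }
//    push(*pcs);                          // final pc, back into this blob
//  after which unpack_frames() fills in the skeletal interpreter frames.)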
2866 // Pop deoptimized frame
2867 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2868 __ addptr(rsp, rcx);
2869
2870 // rsp should be pointing at the return address to the caller (3)
2871
2872 // Pick up the initial fp we should save
2873 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2874 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2875
2876 #ifdef ASSERT
2877 // Compilers generate code that bangs the stack by as much as the
2878 // interpreter would need. So this stack banging should never
2879 // trigger a fault. Verify that it does not on non-product builds.
2880 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2881 __ bang_stack_size(rbx, rcx);
2882 #endif
2883
2884 // Load address of array of frame pcs into rcx
2885 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2886
2887 // Trash the old pc
2888 __ addptr(rsp, wordSize);
2889
2890 // Load address of array of frame sizes into rsi
2891 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2892
2893 // Load counter into rdx
2894 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2895
2896 // Now adjust the caller's stack to make up for the extra locals,
2897 // but record the original sp so that we can save it in the skeletal interpreter
2898 // frame and the stack walking of interpreter_sender will get the unextended sp
2899 // value and not the "real" sp value.
2900
2901 const Register sender_sp = r8;
2902
2903 __ mov(sender_sp, rsp);
2904 __ movl(rbx, Address(rdi,
2905 Deoptimization::UnrollBlock::
2906 caller_adjustment_offset()));
2907 __ subptr(rsp, rbx);
2908
2909 // Push interpreter frames in a loop
2910 Label loop;
2911 __ bind(loop);
2912 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2913 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2914 __ pushptr(Address(rcx, 0)); // Save return address
2915 __ enter(); // Save old & set new ebp
2916 __ subptr(rsp, rbx); // Prolog
2917 // This value is corrected by layout_activation_impl
2918 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2919 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2920 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2921 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2922 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2923 __ decrementl(rdx); // Decrement counter
2924 __ jcc(Assembler::notZero, loop);
2925 __ pushptr(Address(rcx, 0)); // Save final return address
2926
2927 // Re-push self-frame
2928 __ enter(); // Save old & set new ebp
2929
2930 // Allocate a full-sized register save area.
2931 // Return address and rbp are in place, so we allocate two fewer words.
2932 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2933
2934 // Restore frame locals after moving the frame
2935 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2936 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2937
2938 // Call C code. Need thread but NOT official VM entry
2939 // crud. We cannot block on this call, no GC can happen. Call should
2940 // restore return values to their stack-slots with the new SP.
2941 //
2942 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2943
2944 // Use rbp because the frames look interpreted now
2945 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2946 // Don't need the precise return PC here, just precise enough to point into this code blob.
2947 address the_pc = __ pc();
2948 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2949
2950 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2951 __ mov(c_rarg0, r15_thread);
2952 __ movl(c_rarg1, r14); // second arg: exec_mode
2953 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2954 // Revert SP alignment after call since we're going to do some SP relative addressing below
2955 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2956
2957 // Set an oopmap for the call site
2958 // Use the same PC we used for the last java frame
2959 oop_maps->add_gc_map(the_pc - start,
2960 new OopMap( frame_size_in_words, 0 ));
2961
2962 // Clear fp AND pc
2963 __ reset_last_Java_frame(true);
2964
2965 // Collect return values
2966 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2967 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2968 // I think this is useless (throwing pc?)
2969 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2970
2971 // Pop self-frame.
2972 __ leave(); // Epilog
2973
2974 // Jump to interpreter
2975 __ ret(0);
2976
2977 // Make sure all code is generated
2978 masm->flush();
2979
2980 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2981 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2982 #if INCLUDE_JVMCI
2983 if (EnableJVMCI) {
2984 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2985 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2986 }
2987 #endif
2988 }
2989
2990 #ifdef COMPILER2
2991 //------------------------------generate_uncommon_trap_blob--------------------
2992 void SharedRuntime::generate_uncommon_trap_blob() {
2993 // Allocate space for the code
2994 ResourceMark rm;
2995 // Setup code generation tools
2996 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2997 MacroAssembler* masm = new MacroAssembler(&buffer);
2998
2999 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3000
3001 address start = __ pc();
3002
3003 // Push self-frame. We get here with a return address on the
3004 // stack, so rsp is 8-byte aligned until we allocate our frame.
3005 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3006
3007 // No callee-saved registers. rbp is assumed implicitly saved
3008 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3009
3010 // compiler left unloaded_class_index in j_rarg0; move it to where the
3011 // runtime expects it.
  __ movl(c_rarg1, j_rarg0);

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // capture callee-saved registers as well as return values.
  // Thread is in rdi already.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);

  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));

  // Set an oopmap for the call site
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);

  // location of rbp is known implicitly by the frame sender code

  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
              Deoptimization::Unpack_uncommon_trap);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
    __ bind(L);
  }
#endif

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).

  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!

  // Pop deoptimized frame (int)
  __ movl(rcx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx (address*)
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the return pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)

  // Now adjust the caller's stack to make up for the extra locals but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame and the stack walking of interpreter_sender
  // will get the unextended sp value and not the "real" sp value.
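  // Illustrative sketch of the frame-pushing loop below (pseudocode only;
  // the pcs/sizes arrays come from the UnrollBlock):
  //
  //   for (int k = number_of_frames; k > 0; k--) {
  //     push(*pcs++);                    // return address for this frame
  //     push(rbp); rbp = rsp;            // i.e. enter()
  //     rsp -= *sizes++ - 2 * wordSize;  // body of the skeletal frame
  //   }
  //   push(*pcs);                        // final return address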
  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, rbx);             // Prolog
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
            sender_sp);            // Make it walkable
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address

  // Re-push self-frame
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); // Prolog

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // restore return values to their stack-slots with the new SP.
  // Thread is in rdi already.
  //
  // BasicType unpack_frames(JavaThread* thread, int exec_mode);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Pop self-frame.
  __ leave();                      // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and sets up the oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code.  Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2348, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map(__ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime, we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
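    // The poll is a "test %eax, (reg)" against the polling page; its length
    // varies with an optional REX.B prefix and with the modrm/SIB encoding of
    // the base register, which the code below decodes byte by byte.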
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point, but
// since this is generic code we don't know what they are; the caller must
// do any GC of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
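  // (Hence save_wide_vectors is hard-wired to false in the call below; compare
  // generate_handler_blob above, which must preserve wide vectors for polls
  // inside vectorized loops.)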
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map(__ offset() - start, map);

  // rax contains the address we are going to jump to, assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob; the frame size passed to new_runtime_stub is in words
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
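// In other words, (T2:T1:T0) += 2 * A * B.  Montgomery squaring uses this
// for the off-diagonal partial products a[j] * a[i-j], each of which occurs
// twice in the full product.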
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
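// The saving comes from the symmetry a[j]*a[i-j] == a[i-j]*a[j]: each
// off-diagonal product is computed once and doubled via MACC2, and the
// diagonal square a[j]*a[j] is added once when i is even.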
static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go.  The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while (len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof(julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
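  // (Worked example at that 512-jint limit: each of the three scratch
  // arrays a, n, m holds 256 julongs, so 3 * 256 * 8 = 6144 bytes.)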
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof(julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Using the exception blob, this code is jumped to from a compiled method.
// (see emit_exception_handler in x86_64.ad file)
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee-saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumptions
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.
  // It checks if an exception handler exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2