/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/SCCache.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
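// Note: the XSAVE_AREA_* constants below match the standard (non-compacted)
// XSAVE save-area layout reported by CPUID leaf 0xD on current x86-64
// processors: the legacy FXSAVE image occupies bytes 0..511 with the XMM
// registers starting at byte 160, the YMM high halves sit at byte 576, the
// AVX-512 opmask registers at 1088, the ZMM high halves at 1152, and the
// upper-bank registers zmm16..zmm31 at 1664. They are processor-defined
// offsets, not values chosen by HotSpot.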
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };
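  // Note: the slots from r15_off up to flags_off mirror, bottom-up, the push
  // order of MacroAssembler::push_CPU_state() (flags and integer registers
  // pushed above the fxsave image), and reg_save_size counts every jint slot
  // in the save area, including the return address slots and the argument
  // register save area that precedes fpu_state_off.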

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
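  // Note: the extra stack area beyond the 512-byte fxsave image written by
  // push_CPU_state() is laid out to match the standard XSAVE offsets (see the
  // XSAVE_AREA_* constants above), so vector halves stored manually here can
  // be described by the same layout enum as the fxsave-written state.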
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop_CPU_state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.
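// For illustration (assuming the usual HotSpot register definitions where
// j_rarg0..j_rarg5 are the C argument registers shifted by one, e.g.
// rsi, rdx, rcx, r8, r9, rdi on Linux), a signature (int, long, Object, double)
// would be assigned roughly as follows:
//   int    -> j_rarg0
//   long   -> j_rarg1 (its trailing T_VOID half gets no register)
//   Object -> j_rarg2
//   double -> j_farg0 (xmm0)
// with stack slots used only once the six integer or eight float argument
// registers are exhausted.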

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //   i   st_off
    //   0   32 T_LONG
    //   1   24 T_VOID
    //   2   16 T_OBJECT
    //   3    8 T_BOOL
    //   -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the
    // interpreter leaves one slot empty and only stores to a single slot. In this
    // case the slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address, restoring the misalignment the youngest frame
  // always expects just after the placement of a call instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
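// For illustration (Win64 path above): because the Windows x64 convention
// assigns argument *positions* rather than independent integer/float
// sequences, the code bumps fp_args when an integer register is consumed and
// int_args when a float register is consumed. A signature like
// (int, double, int, double) would thus land in c_rarg0/rcx, xmm1, r8, xmm3,
// with the mandatory 32-byte home area reflected in the stk_args minimum of
// 8 slots.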

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}
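// Note: next_val selects the last 32-bit slot of the vector, so e.g. a
// 256-bit argument in xmm3 is described by the pair
// (xmm3->as_VMReg()->next(7), xmm3->as_VMReg()), covering eight consecutive
// 4-byte slots.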

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);

  if (CheckJNICalls) {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // If the held monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
    __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // rax may hold an exception oop, save it before the call
    __ push(rax);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
    __ pop(rax);

    // For vthreads we have to explicitly zero the JNI monitor count of the carrier
    // on termination. The held count is implicitly zeroed below when we restore from
    // the parent held count (which has to be zero).
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#ifdef ASSERT
  else {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // See comment just above. If not checking JNI calls the JNI count is only
    // needed for assertion checking.
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#endif

  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;
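  // Note: under the Java calling convention these are j_rarg0..j_rarg2, which
  // happen to map to c_rarg1..c_rarg3 here because the Java convention is the
  // C convention shifted by one register (see java_calling_convention above);
  // the checks below assert exactly that.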

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used only in interp_only_mode
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);

  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, so save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}

static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
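  // (Sketch of the invariant relied on here: a stack walker takes the frame's
  //  saved PC, looks up the enclosing CodeBlob, and then finds the OopMap keyed
  //  by that exact code offset; registering the map at the PC of the nop below
  //  keeps that lookup exact.)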
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into a register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note: This assumes that compiled calling conventions always
      // pass the receiver oop in a register. If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method. The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to Java state (possibly blocking), unhandlizes any result and
// returns.
//
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it is impossible for them
// to be thrown.
//
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  if (method->is_continuation_native_intrinsic()) {
    int exception_offset = -1;
    OopMapSet* oop_maps = new OopMapSet();
    int frame_complete = -1;
    int stack_slots = -1;
    int interpreted_entry_offset = -1;
    int vep_offset = -1;
    if (method->is_continuation_enter_intrinsic()) {
      gen_continuation_enter(masm,
                             in_regs,
                             exception_offset,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             interpreted_entry_offset,
                             vep_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      gen_continuation_yield(masm,
                             in_regs,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             vep_offset);
    } else {
      guarantee(false, "Unknown Continuation native intrinsic");
    }

#ifdef ASSERT
    if (method->is_continuation_enter_intrinsic()) {
      assert(interpreted_entry_offset != -1, "Must be set");
      assert(exception_offset != -1,         "Must be set");
    } else {
      assert(interpreted_entry_offset == -1, "Must be unset");
      assert(exception_offset == -1,         "Must be unset");
    }
    assert(frame_complete != -1, "Must be set");
    assert(stack_slots != -1,    "Must be set");
    assert(vep_offset != -1,     "Must be set");
#endif

    __ flush();
    nmethod* nm = nmethod::new_native_nmethod(method,
                                              compile_id,
                                              masm->code(),
                                              vep_offset,
                                              frame_complete,
                                              stack_slots,
                                              in_ByteSize(-1),
                                              in_ByteSize(-1),
                                              oop_maps,
                                              exception_offset);
    if (nm == nullptr) return nm;
    if (method->is_continuation_enter_intrinsic()) {
      ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      _cont_doYield_stub = nm;
    }
    return nm;
  }

  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method)

  const int total_in_args = method->size_of_parameters();
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  BasicType* in_elem_bt = nullptr;

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT;
  }

  for (int i = 0; i < total_in_args ; i++) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper. We need to handlize all oops in
  // incoming registers.

  // Calculate the total number of stack slots we will need.
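  // Worked example (illustrative only): for  static native int foo(long x)
  // the outgoing C signature is (JNIEnv*, jclass, jlong), i.e.
  //   out_sig_bt = { T_ADDRESS, T_OBJECT, T_LONG, T_VOID }
  // (longs and doubles carry a trailing T_VOID slot in HotSpot signatures,
  // as the T_DOUBLE assert in the move loop below relies on). An instance
  // method's receiver arrives as in_sig_bt[0] == T_OBJECT and needs no extra
  // slot beyond the JNIEnv*.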

  // First count the abi requirement plus all of the outgoing args
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;

  // Now the space for the inbound oop handle area
  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers

  int oop_handle_offset = stack_slots;
  stack_slots += total_save_slots;

  // Now any space we need for handlizing a klass if static method

  int klass_slot_offset = 0;
  int klass_offset = -1;
  int lock_slot_offset = 0;
  bool is_static = false;

  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }

  // Plus a lock if needed

  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }

  // Now a place (+2) to save return values or temp during shuffling
  // + 4 for return address (which we own) and saved rbp
  stack_slots += 6;

  // OK, the space we have allocated will look like:
  //
  //
  // FP-> |                     |
  //      |---------------------|
  //      | 2 slots for moves   |
  //      |---------------------|
  //      | lock box (if sync)  |
  //      |---------------------| <- lock_slot_offset
  //      | klass (if static)   |
  //      |---------------------| <- klass_slot_offset
  //      | oopHandle area      |
  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  //      | outbound memory     |
  //      | based arguments     |
  //      |                     |
  //      |---------------------|
  //      |                     |
  // SP-> | out_preserved_slots |
  //
  //


  // Now compute the actual number of stack words we need, rounding to make
  // the stack properly aligned.
  stack_slots = align_up(stack_slots, StackAlignmentInSlots);

  int stack_size = stack_slots * VMRegImpl::stack_slot_size;

  // First thing, make an IC check to see if we should even be here

  // We are free to use all registers as temps without saving them and
  // restoring them except rbp. rbp is the only callee save register
  // as far as the interpreter and the compiler(s) are concerned.

  const Register receiver = j_rarg0;

  Label exception_pending;

  assert_different_registers(receiver, rscratch1, rscratch2);
  __ verify_oop(receiver);
  __ ic_check(8 /* end_alignment */);

  int vep_offset = ((intptr_t)__ pc()) - start;

  if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
    Label L_skip_barrier;
    Register klass = r10;
    __ mov_metadata(klass, method->method_holder());  // InstanceKlass*
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub()));  // slow path

    __ bind(L_skip_barrier);
  }

#ifdef COMPILER1
  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */


  // We use r14 as the oop handle for the receiver/klass
  // It is callee save so it survives the call to native

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that, for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.), we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument, the C calling
  // convention always has at least one more argument (and two for static methods)
  // than Java. Therefore, if we move the args from java -> c backwards, we will
  // never have a register->register conflict, and we don't have to build a
  // dependency graph and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do.
  // All inbound args are referenced based on rbp and all outbound args via rsp.
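  // Illustrative sketch of why the backward walk is safe (indices schematic):
  // with the C argument list shifted up by at least one slot for the JNIEnv*,
  // the moves performed for i = n-1 down to 0 look like
  //   in[n-1] -> out[n], ..., in[1] -> out[2], in[0] -> out[1]
  // so every move writes a destination that is no longer a pending source, and
  // no temporary is needed to break cycles.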


#ifdef ASSERT
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }

#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
  GrowableArray<int> arg_order(2 * total_in_args);

  VMRegPair tmp_vmreg;
  tmp_vmreg.set2(rbx->as_VMReg());

  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  int temploc = -1;
  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14. Used both by locking code and
  // the normal JNI call code.
  // point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    // load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror; it's known to be non-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment.
  // It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out.

  intptr_t the_pc = (intptr_t) __ pc();
  oop_maps->add_gc_map(the_pc - start, map);

  __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);


  // We have all of the arguments set up at this point. We must not touch any of the
  // register argument registers from here on (if we were to save and restore them,
  // the saved copies would not be covered by any oop map).

  {
    SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
  const Register old_hdr  = r13;  // value of old header at unlock time

  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    Label count_mon;

    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();

    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box

    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    if (LockingMode == LM_MONITOR) {
      __ jmp(slow_path_lock);
    } else if (LockingMode == LM_LEGACY) {
      // Load immediate 1 into swap_reg %rax
      __ movl(swap_reg, 1);

      // Load (object->mark() | 1) into swap_reg %rax
      __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));

      // Save (object->mark() | 1) into BasicLock's displaced header
      __ movptr(Address(lock_reg, mark_word_offset), swap_reg);

      // src -> dest iff dest == rax else rax <- dest
      __ lock();
      __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
      __ jcc(Assembler::equal, count_mon);

      // Hmm, should this move to the slow path code area???

      // Test if the oopMark is an obvious stack pointer, i.e.,
      //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::vm_page_size()
      // These two tests can be done by evaluating the following
      // expression: ((mark - rsp) & (3 - os::vm_page_size())),
      // assuming both the stack pointer and the page size have their
      // least significant 2 bits clear.
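      // Sketch of why the single expression works: let d = mark - rsp and let
      // P = os::vm_page_size(), a power of two (>= 4), so that in two's
      // complement 3 - P == ~(P - 1) | 3. Then
      //   (d & (3 - P)) == 0  <=>  (d & 3) == 0  &&  (d & ~(P - 1)) == 0
      // The first conjunct is the low-bits test (rsp's low two bits are clear,
      // so mark & 3 == d & 3); the second says 0 <= d < P, i.e.
      // rsp <= mark < rsp + P.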
      // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg

      __ subptr(swap_reg, rsp);
      __ andptr(swap_reg, 3 - (int)os::vm_page_size());

      // Save the test result; for the recursive case, the result is zero
      __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
      __ jcc(Assembler::notEqual, slow_path_lock);
    } else {
      assert(LockingMode == LM_LIGHTWEIGHT, "must be");
      __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
    }
    __ bind(count_mon);
    __ inc_held_monitor_count();

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally, just about ready to make the JNI call

  // get JNIEnv* which is first argument to native
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  switch (ret_type) {
    case T_BOOLEAN: __ c2bool(rax);            break;
    case T_CHAR   : __ movzwl(rax, rax);       break;
    case T_BYTE   : __ sign_extend_byte (rax); break;
    case T_SHORT  : __ sign_extend_short(rax); break;
    case T_INT    : /* nothing to do */        break;
    case T_DOUBLE :
    case T_FLOAT  :
      // Result is in xmm0; we'll save it as needed
      break;
    case T_ARRAY:                 // Really a handle
    case T_OBJECT:                // Really a handle
      break; // can't de-handlize until after safepoint check
    case T_VOID: break;
    case T_LONG: break;
    default    : ShouldNotReachHere();
  }

  Label after_transition;

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  //   Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  //   VM thread changes sync state to synchronizing and suspends threads for GC.
  //   Thread A is resumed to finish this native method, but doesn't block here since it
  //   didn't see any synchronization in progress, and escapes.
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  if (!UseSystemMemoryBarrier) {
    __ membar(Assembler::Membar_mask_bits(
                Assembler::LoadLoad | Assembler::LoadStore |
                Assembler::StoreLoad | Assembler::StoreStore));
  }

  // check for safepoint operation in progress and/or pending suspend requests
  {
    Label Continue;
    Label slow_path;

    __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);

    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(slow_path);

    // Don't use call_VM, as it will see a possible pending exception and forward it,
    // never returning here, which would prevent us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
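    // In outline, the hand-rolled transition below is: save the result
    // registers (the callee will not preserve them), pass r15_thread as the
    // JavaThread* argument, align rsp per the ABI (plus Windows argument
    // shadow space), make the call, then restore rsp, the heapbase register,
    // and the result registers.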
    //
    __ vzeroupper();
    save_native_result(masm, ret_type, stack_slots);
    __ mov(c_rarg0, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
    // Restore any method result value
    restore_native_result(masm, ret_type, stack_slots);
    __ bind(Continue);
  }

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
  __ bind(after_transition);

  Label reguard;
  Label reguard_done;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, reguard);
  __ bind(reguard_done);

  // The native result, if any, is live now

  // Unlock
  Label slow_path_unlock;
  Label unlock_done;
  if (method->is_synchronized()) {

    Label fast_done;

    // Get locked oop from the handle we passed to jni
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    if (LockingMode == LM_LEGACY) {
      Label not_recur;
      // Simple recursive lock?
      __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
      __ jcc(Assembler::notEqual, not_recur);
      __ dec_held_monitor_count();
      __ jmpb(fast_done);
      __ bind(not_recur);
    }

    // Must save rax if it is live now because cmpxchg must use it
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      save_native_result(masm, ret_type, stack_slots);
    }

    if (LockingMode == LM_MONITOR) {
      __ jmp(slow_path_unlock);
    } else if (LockingMode == LM_LEGACY) {
      // get address of the stack lock
      __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
      // get old displaced header
      __ movptr(old_hdr, Address(rax, 0));

      // Atomic swap old header if oop still contains the stack lock
      __ lock();
      __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
      __ jcc(Assembler::notEqual, slow_path_unlock);
      __ dec_held_monitor_count();
    } else {
      assert(LockingMode == LM_LIGHTWEIGHT, "must be");
      __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
      __ dec_held_monitor_count();
    }

    // slow path re-enters here
    __ bind(unlock_done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      restore_native_result(masm, ret_type, stack_slots);
    }

    __ bind(fast_done);
  }
  {
    SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
    save_native_result(masm, ret_type, stack_slots);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
      r15_thread, c_rarg1);
    restore_native_result(masm, ret_type, stack_slots);
  }

  __ reset_last_Java_frame(false);

  // Unbox oop result, e.g. JNIHandles::resolve value.
  if (is_reference_type(ret_type)) {
    __ resolve_jobject(rax /* value */,
                       r15_thread /* thread */,
                       rcx /* tmp */);
  }

  if (CheckJNICalls) {
    // clear_pending_jni_exception_check
    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
  }

  // reset handle block
  __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
  __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);

  // pop our frame

  __ leave();

  // Any exception pending?
  __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
  __ jcc(Assembler::notEqual, exception_pending);

  // Return

  __ ret(0);

  // Unexpected paths are out of line and go here

  // forward the exception
  __ bind(exception_pending);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // Slow path locking & unlocking
  if (method->is_synchronized()) {

    // BEGIN Slow path lock
    __ bind(slow_path_lock);

    // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
    // args are (oop obj, BasicLock* lock, JavaThread* thread)

    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg1, lock_reg);
    __ mov(c_rarg2, r15_thread);

    // Not a leaf, but we have last_Java_frame set up as we want
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
    restore_args(masm, total_c_args, c_arg, out_regs);

#ifdef ASSERT
    { Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit from monitorenter");
      __ bind(L);
    }
#endif
    __ jmp(lock_done);

    // END Slow path lock

    // BEGIN Slow path unlock
    __ bind(slow_path_unlock);

    // If we haven't already saved the native result we must save it now, as the xmm
    // registers are still exposed.
    __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg2, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI

    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
    // NOTE that obj_reg == rbx currently
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);

    // args are (oop obj, BasicLock* lock, JavaThread* thread)
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */

    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);

    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);

    // END Slow path unlock

  } // synchronized

  // SLOW PATH Reguard the stack if needed

  __ bind(reguard);
  __ vzeroupper();
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  // and continue
  __ jmp(reguard_done);


  __ flush();

  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);

  return nm;
}

// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
}


uint SharedRuntime::out_preserve_stack_slots() {
  return 0;
}


// Number of stack slots between incoming argument block and the start of
// a new frame. The PROLOG must add this many slots to the stack. The
// EPILOG must remove this many slots. amd64 needs two slots for
// return address.
uint SharedRuntime::in_preserve_stack_slots() {
  return 4 + 2 * VerifyStackAtCalls;
}

//------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  int pad = 0;
  if (UseAVX > 2) {
    pad += 1024;
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod. A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod, at the point where control returned into it. So the return
  // address on the stack is wrong by NativeCall::instruction_size.
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt. We save the argument return
  // registers. We call the first C routine, fetch_unroll_info(). This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state. We pop the current frame, then
  // push all the new frames. Then we call the C routine unpack_frames() to
  // populate these frames. Finally unpack_frames() returns us the new target
  // address. Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for the non-exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (EnableJVMCI && UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case: the return address is the pc that describes which bci
  // to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map(__ pc() - start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // All registers are dead at this entry point, except for rax and
  // rdx which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall through to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // Load the throwing pc from JavaThread and patch it in as the return address
  // of the current frame. Then clear the field in JavaThread.

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  __ bind(cont);

  // Call C code. Need thread and this frame, but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers. Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack,
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save.
  // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2*wordSize);      // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new ebp
  __ subptr(rsp, rbx);             // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address

  // Re-push self-frame
  __ enter();                      // Save old & set new ebp

  // Allocate a full-sized register save area.
  // Return address and rbp are in place, so we allocate two fewer words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now.
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap(frame_size_in_words, 0));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave(); // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif
}

#ifdef COMPILER2
//------------------------------generate_uncommon_trap_blob--------------------
void SharedRuntime::generate_uncommon_trap_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  address start = __ pc();

  if (UseRTMLocking) {
    // Abort RTM transaction before possible nmethod deoptimization.
    __ xabort(0);
  }

  // Push self-frame. We get here with a return address on the
  // stack, so rsp is 8-byte aligned until we allocate our frame.
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // No callee-saved registers; rbp is assumed implicitly saved.
  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // The compiler left unloaded_class_index in j_rarg0; move it to where the
  // runtime expects it.
  __ movl(c_rarg1, j_rarg0);

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // capture callee-saved registers as well as return values.
  // Thread is in rdi already.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);

  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));

  // Set an oopmap for the call site
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);

  // location of rbp is known implicitly by the frame sender code

  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
              Deoptimization::Unpack_uncommon_trap);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
    __ bind(L);
  }
#endif

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).

  // Pop self-frame. We have no frame, and must rely only on rax and rsp.
  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!

  // Pop deoptimized frame (int)
  __ movl(rcx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save.
  // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx (address*)
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the return pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)

  // Now adjust the caller's stack to make up for the extra locals but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame and the stack walking of interpreter_sender
  // will get the unextended sp value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, rbx);             // Prolog
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
            sender_sp);            // Make it walkable
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address

  // Re-push self-frame
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
                                   // Prolog

  // Use rbp because the frames look interpreted now.
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  // Call C code. Need thread but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen. Call should
  // restore return values to their stack-slots with the new SP.
  // Thread is in rdi already.
  //
  // BasicType unpack_frames(JavaThread* thread, int exec_mode);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Pop self-frame.
  // Pop self-frame.
  __ leave();                      // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------------------------
//
// Generate a special Compile2Runtime blob that saves all registers
// and sets up the oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code.  Set up code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  if (UseRTMLocking) {
    // Abort the RTM transaction before calling the runtime,
    // because the critical section will be large and will be
    // aborted anyway.  Also the nmethod could be deoptimized.
    __ xabort(0);
  }

  // Make room for the return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap.  Hence, we do all
  // the work ourselves.

  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its
    // appropriate place on the stack.
    // Additionally, rbx is a callee-saved register, so we can look at it
    // later to determine if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));
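  // In outline, the hand-rolled call_VM pattern used here is (a sketch for
  // orientation, not additional emitted code):
  //   set_last_Java_frame(...)      // publish an anchor for stack walking
  //   mov(c_rarg0, r15_thread)      // pass the thread as the first argument
  //   call(call_ptr)                // the actual C call
  //   add_gc_map(pc - start, map)   // oopmap keyed to the call's return pc
  //   reset_last_Java_frame(...)    // tear the anchor down again
  // call_VM itself would also check for pending exceptions, which this blob
  // does by hand below.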
  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime, we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test %eax,(%rax)
    // 85 01       test %eax,(%rcx)
    // 85 02       test %eax,(%rdx)
    // 85 03       test %eax,(%rbx)
    // 85 06       test %eax,(%rsi)
    // 85 07       test %eax,(%rdi)
    //
    // 41 85 00    test %eax,(%r8)
    // 41 85 01    test %eax,(%r9)
    // 41 85 02    test %eax,(%r10)
    // 41 85 03    test %eax,(%r11)
    // 41 85 06    test %eax,(%r14)
    // 41 85 07    test %eax,(%r15)
    //
    // 85 04 24    test %eax,(%rsp)
    // 41 85 04 24 test %eax,(%r12)
    // 85 45 00    test %eax,0x0(%rbp)
    // 41 85 45 00 test %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // An r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits.
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect.
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust the return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}
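// A compact C rendering of the poll-skipping length computation above (an
// illustrative sketch only, kept out of the build; 'pc' points at the poll):
#if 0
static int safepoint_poll_instruction_length(const unsigned char* pc) {
  int len = 2;                             // "85 /r" test: opcode + modrm
  if (pc[0] == 0x41) { len++; pc++; }      // REX.B prefix for r8..r15 bases
  int base = pc[1] & 0x07;                 // modrm base register field
  if (base == 0x04 || base == 0x05) len++; // rsp/r12 need a SIB byte,
                                           // rbp/r13 need a disp8 byte
  return len;
}
#endif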

//------------------------------generate_resolve_blob------------------------
// call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call.  All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1200, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to, assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to the exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob (the frame size is given in words)
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS
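// Quick reference for the GCC extended-asm constraints used below:
//   "r"    any general-purpose register      "a"/"d"  rax/rdx specifically
//   "=&r"  write-only, early-clobber output  "+r"     read-write operand
//   "g"    register, memory, or immediate
//   [name] gives an operand a symbolic name, referenced as %[name].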
// Subtract 0:b from carry:a.  Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                     \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                    \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
  c = _addcarry_u64(0, lo, T0, &T0);               \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

#endif //_WINDOWS
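// For readers who don't speak x86 asm or carry intrinsics: a portable sketch
// of what MACC computes, using the unsigned __int128 type available on
// GCC/Clang (kept out of the build; illustrative only):
#if 0
static inline void macc_reference(julong a, julong b,
                                  julong &t0, julong &t1, julong &t2) {
  unsigned __int128 p = (unsigned __int128)a * b + t0; // double-length product
  t0 = (julong)p;                                      // low 64 bits
  unsigned __int128 s = (unsigned __int128)t1 + (julong)(p >> 64);
  t1 = (julong)s;                                      // middle word + carry in
  t2 += (julong)(s >> 64);                             // propagate final carry
}
#endif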

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies, so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap the two 32-bit halves of a 64-bit longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go.  The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;
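  // Concretely (a worked instance of the bound checked below): at the cap of
  // 512 jints we get longwords == 256, and the four scratch arrays then
  // occupy 4 * 256 * sizeof(julong) == 8192 bytes of stack.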
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof(julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof(julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
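// A caller-side sketch of the contract (illustrative only, kept out of the
// build; the names and values are hypothetical, and the inputs are normally
// supplied by the BigInteger intrinsics):
#if 0
static void montgomery_square_example() {
  // Squaring a 2048-bit value modulo an odd n, with R = 2^2048:
  jint a[64], n[64], m[64];    // 64 jints == 2048 bits, most-significant jint first
  jlong inv = precomputed_inv; // hypothetical: inv * n_low64 == -1 (mod 2^64)
  SharedRuntime::montgomery_square(a, n, 64, inv, m);
  // m now holds a^2 * R^-1 (mod n), in the same jint layout as the inputs.
}
#endif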

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Compiled code jumps to this blob via the exception handler
// (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call, we call into the runtime for the
// handler in this method.  This handler might merely restore state
// (i.e., callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee-saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Set up code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  int pc_offset = 0;
  if (SCCache::load_exception_blob(&buffer, &pc_offset)) {
    OopMapSet* oop_maps = new OopMapSet();
    oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));

    // Set the exception blob
    _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
    return;
  }

  MacroAssembler* masm = new MacroAssembler(&buffer);
  address start = __ pc();

  // The exception pc is the 'return address' for the stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog).  Other than that
  // there are no callee-saved registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store the exception in the Thread object.  We cannot pass any arguments
  // to the handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
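  // Worked example of the alignment mask (with StackAlignmentInBytes == 16,
  // the usual x86-64 value): -16 is ~0xf, so the andptr above clears the low
  // four bits of rsp, e.g. 0x00007f1234567f38 becomes 0x00007f1234567f30,
  // giving the 16-byte alignment the ABI requires at the call below.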
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  pc_offset = the_pc - start;
  oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog).  Other than that
  // there are no callee-saved registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for the exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to the handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  SCCache::store_exception_blob(&buffer, pc_offset);
  // Set the exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2