/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
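  // (Added note, descriptive only): the XSAVE_AREA_* constants below are the
  // conventional byte offsets within the fxsave/xsave image -- xmm0 lives at
  // offset 160 of the legacy fxsave area, the YMM upper halves at 576, the
  // opmask registers at 1088, the ZMM upper halves at 1152, and the upper-bank
  // registers zmm16..zmm31 at 1664.  The DEF_*_OFFS macros simply translate
  // those byte offsets into jint-sized compiler stack slots.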
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,   // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter would produce.
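  // (Illustrative only -- the authoritative layout is the enum above.)
  // After enter() and push_CPU_state() the frame looks roughly like this,
  // from higher to lower addresses:
  //   return address, saved rbp, flags/align, rax..r15 (GPRs),
  //   fxsave/xsave area, [argument register save area on Windows] <- rsp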

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ),
                          r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?
                       Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
// up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
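  // Illustrative example (added comment, not part of the original code): for a
  // Java signature (int, long, Object, float, double), sig_bt is
  //   T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT, T_DOUBLE, T_VOID
  // and the loop below produces
  //   int    -> j_rarg0        float  -> j_farg0
  //   long   -> j_rarg1        double -> j_farg1
  //   Object -> j_rarg2
  // with any further arguments spilling to stack slots counted in stk_args.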
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //  i   st_off
    //  0   32 T_LONG
    //  1   24 T_VOID
    //  2   16 T_OBJECT
    //  3    8 T_BOOL
    //  -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
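    // (Added note, derived from the stores below): for T_LONG/T_DOUBLE the
    // value is written to next_off (the lower-addressed T_VOID slot) and, in
    // debug builds, st_off is filled with a known junk pattern.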

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory, use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address, misaligning the stack exactly as the youngest
  // frame always sees it, i.e. as if a call instruction had just pushed it
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
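  // Illustrative note (added, not authoritative): for a native signature like
  // (JNIEnv*, jobject, jint, jdouble) the loop below yields, per ABI:
  //   System V : c_rarg0, c_rarg1, c_rarg2 for the pointer/int args and
  //              c_farg0 for the double -- integer and fp registers are
  //              counted independently.
  //   Windows  : argument position and register position coincide, so the
  //              double lands in c_farg3 (the 4th slot) and shadow stack
  //              space is reserved for every register argument.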

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
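// (Added note): for a critical native taking e.g. a jint[] parameter, the
// Java-side array reference is expanded below into the (length, element*)
// pair that the C signature expects, or (0, 0) when the array is null.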
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
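//
// (Added illustration): a two-element cycle such as swapping arguments held
// in rdi and rsi (arg0: rdi -> rsi, arg1: rsi -> rdi) cannot be emitted in
// either order without clobbering one source, so break_cycle() below routes
// one of the moves through the caller-supplied temp register, e.g.
// rdi -> temp, rsi -> rdi, temp -> rsi (or the mirror image, depending on
// traversal order).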
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair _src;
    VMRegPair _dst;
    int _src_index;
    int _dst_index;
    bool _processed;
    MoveOperation*  _next;
    MoveOperation*  _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit the moves in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting into the store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, since it's impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them.
To figure out where they go 1587 // we convert the java signature to a C signature by inserting 1588 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1589 1590 const int total_in_args = method->size_of_parameters(); 1591 int total_c_args = total_in_args; 1592 if (!is_critical_native) { 1593 total_c_args += 1; 1594 if (method->is_static()) { 1595 total_c_args++; 1596 } 1597 } else { 1598 for (int i = 0; i < total_in_args; i++) { 1599 if (in_sig_bt[i] == T_ARRAY) { 1600 total_c_args++; 1601 } 1602 } 1603 } 1604 1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1607 BasicType* in_elem_bt = NULL; 1608 1609 int argc = 0; 1610 if (!is_critical_native) { 1611 out_sig_bt[argc++] = T_ADDRESS; 1612 if (method->is_static()) { 1613 out_sig_bt[argc++] = T_OBJECT; 1614 } 1615 1616 for (int i = 0; i < total_in_args ; i++ ) { 1617 out_sig_bt[argc++] = in_sig_bt[i]; 1618 } 1619 } else { 1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); 1621 SignatureStream ss(method->signature()); 1622 for (int i = 0; i < total_in_args ; i++ ) { 1623 if (in_sig_bt[i] == T_ARRAY) { 1624 // Arrays are passed as int, elem* pair 1625 out_sig_bt[argc++] = T_INT; 1626 out_sig_bt[argc++] = T_ADDRESS; 1627 ss.skip_array_prefix(1); // skip one '[' 1628 assert(ss.is_primitive(), "primitive type expected"); 1629 in_elem_bt[i] = ss.type(); 1630 } else { 1631 out_sig_bt[argc++] = in_sig_bt[i]; 1632 in_elem_bt[i] = T_VOID; 1633 } 1634 if (in_sig_bt[i] != T_VOID) { 1635 assert(in_sig_bt[i] == ss.type() || 1636 in_sig_bt[i] == T_ARRAY, "must match"); 1637 ss.next(); 1638 } 1639 } 1640 } 1641 1642 // Now figure out where the args must be stored and how much stack space 1643 // they require. 1644 int out_arg_slots; 1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1646 1647 // Compute framesize for the wrapper. We need to handlize all oops in 1648 // incoming registers 1649 1650 // Calculate the total number of stack slots we will need. 1651 1652 // First count the abi requirement plus all of the outgoing args 1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1654 1655 // Now the space for the inbound oop handle area 1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1657 if (is_critical_native) { 1658 // Critical natives may have to call out so they need a save area 1659 // for register arguments. 
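// The loop below sizes that save area: each integer-class register argument
// (boolean/byte/short/char/int) gets one 32-bit slot, while longs, doubles
// and T_ARRAY values (a 64-bit oop on LP64) get two; arguments that already
// arrive on the stack are left where they are.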
1660 int double_slots = 0; 1661 int single_slots = 0; 1662 for ( int i = 0; i < total_in_args; i++) { 1663 if (in_regs[i].first()->is_Register()) { 1664 const Register reg = in_regs[i].first()->as_Register(); 1665 switch (in_sig_bt[i]) { 1666 case T_BOOLEAN: 1667 case T_BYTE: 1668 case T_SHORT: 1669 case T_CHAR: 1670 case T_INT: single_slots++; break; 1671 case T_ARRAY: // specific to LP64 (7145024) 1672 case T_LONG: double_slots++; break; 1673 default: ShouldNotReachHere(); 1674 } 1675 } else if (in_regs[i].first()->is_XMMRegister()) { 1676 switch (in_sig_bt[i]) { 1677 case T_FLOAT: single_slots++; break; 1678 case T_DOUBLE: double_slots++; break; 1679 default: ShouldNotReachHere(); 1680 } 1681 } else if (in_regs[i].first()->is_FloatRegister()) { 1682 ShouldNotReachHere(); 1683 } 1684 } 1685 total_save_slots = double_slots * 2 + single_slots; 1686 // align the save area 1687 if (double_slots != 0) { 1688 stack_slots = align_up(stack_slots, 2); 1689 } 1690 } 1691 1692 int oop_handle_offset = stack_slots; 1693 stack_slots += total_save_slots; 1694 1695 // Now any space we need for handlizing a klass if static method 1696 1697 int klass_slot_offset = 0; 1698 int klass_offset = -1; 1699 int lock_slot_offset = 0; 1700 bool is_static = false; 1701 1702 if (method->is_static()) { 1703 klass_slot_offset = stack_slots; 1704 stack_slots += VMRegImpl::slots_per_word; 1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1706 is_static = true; 1707 } 1708 1709 // Plus a lock if needed 1710 1711 if (method->is_synchronized()) { 1712 lock_slot_offset = stack_slots; 1713 stack_slots += VMRegImpl::slots_per_word; 1714 } 1715 1716 // Now a place (+2) to save return values or temp during shuffling 1717 // + 4 for return address (which we own) and saved rbp 1718 stack_slots += 6; 1719 1720 // Ok The space we have allocated will look like: 1721 // 1722 // 1723 // FP-> | | 1724 // |---------------------| 1725 // | 2 slots for moves | 1726 // |---------------------| 1727 // | lock box (if sync) | 1728 // |---------------------| <- lock_slot_offset 1729 // | klass (if static) | 1730 // |---------------------| <- klass_slot_offset 1731 // | oopHandle area | 1732 // |---------------------| <- oop_handle_offset (6 java arg registers) 1733 // | outbound memory | 1734 // | based arguments | 1735 // | | 1736 // |---------------------| 1737 // | | 1738 // SP-> | out_preserved_slots | 1739 // 1740 // 1741 1742 1743 // Now compute actual number of stack words we need rounding to make 1744 // stack properly aligned. 1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1746 1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1748 1749 // First thing make an ic check to see if we should even be here 1750 1751 // We are free to use all registers as temps without saving them and 1752 // restoring them except rbp. rbp is the only callee save register 1753 // as far as the interpreter and the compiler(s) are concerned. 
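// Inline cache check: the compiled caller passes the expected klass in rax
// (ic_reg) and the receiver in j_rarg0. If the receiver's actual klass
// matches, execution falls through to the verified entry point below;
// otherwise we tail-jump to the ic-miss stub so the call site can be
// re-resolved.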
1754
1755
1756 const Register ic_reg = rax;
1757 const Register receiver = j_rarg0;
1758
1759 Label hit;
1760 Label exception_pending;
1761
1762 assert_different_registers(ic_reg, receiver, rscratch1);
1763 __ verify_oop(receiver);
1764 __ load_klass(rscratch1, receiver, rscratch2);
1765 __ cmpq(ic_reg, rscratch1);
1766 __ jcc(Assembler::equal, hit);
1767
1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1769
1770 // Verified entry point must be aligned
1771 __ align(8);
1772
1773 __ bind(hit);
1774
1775 int vep_offset = ((intptr_t)__ pc()) - start;
1776
1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1778 Label L_skip_barrier;
1779 Register klass = r10;
1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1782
1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1784
1785 __ bind(L_skip_barrier);
1786 }
1787
1788 #ifdef COMPILER1
1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1792 }
1793 #endif // COMPILER1
1794
1795 // The instruction at the verified entry point must be 5 bytes or longer
1796 // because it can be patched on the fly by make_non_entrant. The stack bang
1797 // instruction fits that requirement.
1798
1799 // Generate stack overflow check
1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1801
1802 // Generate a new frame for the wrapper.
1803 __ enter();
1804 // -2 because return address is already present and so is saved rbp
1805 __ subptr(rsp, stack_size - 2*wordSize);
1806
1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1808 bs->nmethod_entry_barrier(masm);
1809
1810 // Frame is now completed as far as size and linkage.
1811 int frame_complete = ((intptr_t)__ pc()) - start;
1812
1813 if (UseRTMLocking) {
1814 // Abort RTM transaction before calling JNI
1815 // because critical section will be large and will be
1816 // aborted anyway. Also nmethod could be deoptimized.
1817 __ xabort(0);
1818 }
1819
1820 #ifdef ASSERT
1821 {
1822 Label L;
1823 __ mov(rax, rsp);
1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1825 __ cmpptr(rax, rsp);
1826 __ jcc(Assembler::equal, L);
1827 __ stop("improperly aligned stack");
1828 __ bind(L);
1829 }
1830 #endif /* ASSERT */
1831
1832
1833 // We use r14 as the oop handle for the receiver/klass
1834 // It is callee save so it survives the call to native
1835
1836 const Register oop_handle_reg = r14;
1837
1838 //
1839 // We immediately shuffle the arguments so that for any vm call we have to
1840 // make from here on out (sync slow path, jvmti, etc.) we will have
1841 // captured the oops from our caller and have a valid oopMap for
1842 // them.
1843
1844 // -----------------
1845 // The Grand Shuffle
1846
1847 // The Java calling convention is either equal (linux) or denser (win64) than the
1848 // c calling convention. However, because of the jni_env argument the c calling
1849 // convention always has at least one more (and two for static) arguments than Java.
1850 // Therefore if we move the args from java -> c backwards then we will never have 1851 // a register->register conflict and we don't have to build a dependency graph 1852 // and figure out how to break any cycles. 1853 // 1854 1855 // Record esp-based slot for receiver on stack for non-static methods 1856 int receiver_offset = -1; 1857 1858 // This is a trick. We double the stack slots so we can claim 1859 // the oops in the caller's frame. Since we are sure to have 1860 // more args than the caller doubling is enough to make 1861 // sure we can capture all the incoming oop args from the 1862 // caller. 1863 // 1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1865 1866 // Mark location of rbp (someday) 1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1868 1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1870 // All inbound args are referenced based on rbp and all outbound args via rsp. 1871 1872 1873 #ifdef ASSERT 1874 bool reg_destroyed[RegisterImpl::number_of_registers]; 1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1877 reg_destroyed[r] = false; 1878 } 1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1880 freg_destroyed[f] = false; 1881 } 1882 1883 #endif /* ASSERT */ 1884 1885 // This may iterate in two different directions depending on the 1886 // kind of native it is. The reason is that for regular JNI natives 1887 // the incoming and outgoing registers are offset upwards and for 1888 // critical natives they are offset down. 1889 GrowableArray<int> arg_order(2 * total_in_args); 1890 1891 VMRegPair tmp_vmreg; 1892 tmp_vmreg.set2(rbx->as_VMReg()); 1893 1894 if (!is_critical_native) { 1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1896 arg_order.push(i); 1897 arg_order.push(c_arg); 1898 } 1899 } else { 1900 // Compute a valid move order, using tmp_vmreg to break any cycles 1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); 1902 } 1903 1904 int temploc = -1; 1905 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1906 int i = arg_order.at(ai); 1907 int c_arg = arg_order.at(ai + 1); 1908 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1909 if (c_arg == -1) { 1910 assert(is_critical_native, "should only be required for critical natives"); 1911 // This arg needs to be moved to a temporary 1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); 1913 in_regs[i] = tmp_vmreg; 1914 temploc = i; 1915 continue; 1916 } else if (i == -1) { 1917 assert(is_critical_native, "should only be required for critical natives"); 1918 // Read from the temporary location 1919 assert(temploc != -1, "must be valid"); 1920 i = temploc; 1921 temploc = -1; 1922 } 1923 #ifdef ASSERT 1924 if (in_regs[i].first()->is_Register()) { 1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1926 } else if (in_regs[i].first()->is_XMMRegister()) { 1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1928 } 1929 if (out_regs[c_arg].first()->is_Register()) { 1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1933 } 1934 #endif /* 
ASSERT */
1935 switch (in_sig_bt[i]) {
1936 case T_ARRAY:
1937 if (is_critical_native) {
1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1939 c_arg++;
1940 #ifdef ASSERT
1941 if (out_regs[c_arg].first()->is_Register()) {
1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1945 }
1946 #endif
1947 break;
1948 }
1949 case T_OBJECT:
1950 assert(!is_critical_native, "no oop arguments");
1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1952 ((i == 0) && (!is_static)),
1953 &receiver_offset);
1954 break;
1955 case T_VOID:
1956 break;
1957
1958 case T_FLOAT:
1959 __ float_move(in_regs[i], out_regs[c_arg]);
1960 break;
1961
1962 case T_DOUBLE:
1963 assert( i + 1 < total_in_args &&
1964 in_sig_bt[i + 1] == T_VOID &&
1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1966 __ double_move(in_regs[i], out_regs[c_arg]);
1967 break;
1968
1969 case T_LONG :
1970 __ long_move(in_regs[i], out_regs[c_arg]);
1971 break;
1972
1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1974
1975 default:
1976 __ move32_64(in_regs[i], out_regs[c_arg]);
1977 }
1978 }
1979
1980 int c_arg;
1981
1982 // Pre-load a static method's oop into r14. Used both by locking code and
1983 // the normal JNI call code.
1984 if (!is_critical_native) {
1985 // point c_arg at the first arg that is already loaded in case we
1986 // need to spill before we call out
1987 c_arg = total_c_args - total_in_args;
1988
1989 if (method->is_static()) {
1990
1991 // load oop into a register
1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1993
1994 // Now handlize the static class mirror; it's known not-null.
1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1997
1998 // Now get the handle
1999 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2000 // store the klass handle as second argument
2001 __ movptr(c_rarg1, oop_handle_reg);
2002 // and protect the arg if we must spill
2003 c_arg--;
2004 }
2005 } else {
2006 // For JNI critical methods we need to save all registers in save_args.
2007 c_arg = 0;
2008 }
2009
2010 // Change state to native (we save the return address in the thread, since it might not
2011 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2012 // points into the right code segment. It does not have to be the correct return pc.
2013 // We use the same pc/oopMap repeatedly when we call out
2014
2015 intptr_t the_pc = (intptr_t) __ pc();
2016 oop_maps->add_gc_map(the_pc - start, map);
2017
2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2019
2020
2021 // We have all of the arguments set up at this point. We must not touch any
2022 // argument registers from here on (what if we need to save/restore them and there are no oops?).
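// The DTrace method-entry probe and the RedefineClasses tracing hook below
// both call back into the VM, so the already-shuffled C argument registers
// are spilled with save_args() and reloaded with restore_args() around each
// call_VM_leaf.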
2023 2024 { 2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2026 // protect the args we've loaded 2027 save_args(masm, total_c_args, c_arg, out_regs); 2028 __ mov_metadata(c_rarg1, method()); 2029 __ call_VM_leaf( 2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2031 r15_thread, c_rarg1); 2032 restore_args(masm, total_c_args, c_arg, out_regs); 2033 } 2034 2035 // RedefineClasses() tracing support for obsolete method entry 2036 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2037 // protect the args we've loaded 2038 save_args(masm, total_c_args, c_arg, out_regs); 2039 __ mov_metadata(c_rarg1, method()); 2040 __ call_VM_leaf( 2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2042 r15_thread, c_rarg1); 2043 restore_args(masm, total_c_args, c_arg, out_regs); 2044 } 2045 2046 // Lock a synchronized method 2047 2048 // Register definitions used by locking and unlocking 2049 2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2051 const Register obj_reg = rbx; // Will contain the oop 2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2053 const Register old_hdr = r13; // value of old header at unlock time 2054 2055 Label slow_path_lock; 2056 Label lock_done; 2057 2058 if (method->is_synchronized()) { 2059 assert(!is_critical_native, "unhandled"); 2060 2061 2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2063 2064 // Get the handle (the 2nd argument) 2065 __ mov(oop_handle_reg, c_rarg1); 2066 2067 // Get address of the box 2068 2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2070 2071 // Load the oop from the handle 2072 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2073 2074 if (LockingMode == LM_MONITOR) { 2075 __ jmp(slow_path_lock); 2076 } else if (LockingMode == LM_LEGACY) { 2077 if (UseBiasedLocking) { 2078 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock); 2079 } 2080 2081 // Load immediate 1 into swap_reg %rax 2082 __ movl(swap_reg, 1); 2083 2084 // Load (object->mark() | 1) into swap_reg %rax 2085 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2086 2087 // Save (object->mark() | 1) into BasicLock's displaced header 2088 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2089 2090 // src -> dest iff dest == rax else rax <- dest 2091 __ lock(); 2092 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2093 __ jcc(Assembler::equal, lock_done); 2094 2095 // Hmm should this move to the slow path code area??? 2096 2097 // Test if the oopMark is an obvious stack pointer, i.e., 2098 // 1) (mark & 3) == 0, and 2099 // 2) rsp <= mark < mark + os::pagesize() 2100 // These 3 tests can be done by evaluating the following 2101 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2102 // assuming both stack pointer and pagesize have their 2103 // least significant 2 bits clear. 
2104 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2105 2106 __ subptr(swap_reg, rsp); 2107 __ andptr(swap_reg, 3 - os::vm_page_size()); 2108 2109 // Save the test result, for recursive case, the result is zero 2110 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2111 __ jcc(Assembler::notEqual, slow_path_lock); 2112 } else { 2113 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2114 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2115 } 2116 2117 // Slow path will re-enter here 2118 2119 __ bind(lock_done); 2120 } 2121 2122 // Finally just about ready to make the JNI call 2123 2124 // get JNIEnv* which is first argument to native 2125 if (!is_critical_native) { 2126 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2127 2128 // Now set thread in native 2129 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2130 } 2131 2132 __ call(RuntimeAddress(native_func)); 2133 2134 // Verify or restore cpu control state after JNI call 2135 __ restore_cpu_control_state_after_jni(); 2136 2137 // Unpack native results. 2138 switch (ret_type) { 2139 case T_BOOLEAN: __ c2bool(rax); break; 2140 case T_CHAR : __ movzwl(rax, rax); break; 2141 case T_BYTE : __ sign_extend_byte (rax); break; 2142 case T_SHORT : __ sign_extend_short(rax); break; 2143 case T_INT : /* nothing to do */ break; 2144 case T_DOUBLE : 2145 case T_FLOAT : 2146 // Result is in xmm0 we'll save as needed 2147 break; 2148 case T_ARRAY: // Really a handle 2149 case T_OBJECT: // Really a handle 2150 break; // can't de-handlize until after safepoint check 2151 case T_VOID: break; 2152 case T_LONG: break; 2153 default : ShouldNotReachHere(); 2154 } 2155 2156 Label after_transition; 2157 2158 // If this is a critical native, check for a safepoint or suspend request after the call. 2159 // If a safepoint is needed, transition to native, then to native_trans to handle 2160 // safepoints like the native methods that are not critical natives. 2161 if (is_critical_native) { 2162 Label needs_safepoint; 2163 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */); 2164 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2165 __ jcc(Assembler::equal, after_transition); 2166 __ bind(needs_safepoint); 2167 } 2168 2169 // Switch thread to "native transition" state before reading the synchronization state. 2170 // This additional state is necessary because reading and testing the synchronization 2171 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2172 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2173 // VM thread changes sync state to synchronizing and suspends threads for GC. 2174 // Thread A is resumed to finish this native method, but doesn't block here since it 2175 // didn't see any synchronization is progress, and escapes. 
2176 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2177 2178 // Force this write out before the read below 2179 __ membar(Assembler::Membar_mask_bits( 2180 Assembler::LoadLoad | Assembler::LoadStore | 2181 Assembler::StoreLoad | Assembler::StoreStore)); 2182 2183 // check for safepoint operation in progress and/or pending suspend requests 2184 { 2185 Label Continue; 2186 Label slow_path; 2187 2188 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2189 2190 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2191 __ jcc(Assembler::equal, Continue); 2192 __ bind(slow_path); 2193 2194 // Don't use call_VM as it will see a possible pending exception and forward it 2195 // and never return here preventing us from clearing _last_native_pc down below. 2196 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2197 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2198 // by hand. 2199 // 2200 __ vzeroupper(); 2201 save_native_result(masm, ret_type, stack_slots); 2202 __ mov(c_rarg0, r15_thread); 2203 __ mov(r12, rsp); // remember sp 2204 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2205 __ andptr(rsp, -16); // align stack as required by ABI 2206 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2207 __ mov(rsp, r12); // restore sp 2208 __ reinit_heapbase(); 2209 // Restore any method result value 2210 restore_native_result(masm, ret_type, stack_slots); 2211 __ bind(Continue); 2212 } 2213 2214 // change thread state 2215 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2216 __ bind(after_transition); 2217 2218 Label reguard; 2219 Label reguard_done; 2220 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2221 __ jcc(Assembler::equal, reguard); 2222 __ bind(reguard_done); 2223 2224 // native result if any is live 2225 2226 // Unlock 2227 Label unlock_done; 2228 Label slow_path_unlock; 2229 if (method->is_synchronized()) { 2230 2231 // Get locked oop from the handle we passed to jni 2232 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2233 2234 Label done; 2235 2236 if (UseBiasedLocking) { 2237 __ biased_locking_exit(obj_reg, old_hdr, done); 2238 } 2239 2240 if (LockingMode == LM_LEGACY) { 2241 // Simple recursive lock? 
2242 2243 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2244 __ jcc(Assembler::equal, done); 2245 } 2246 2247 // Must save rax if if it is live now because cmpxchg must use it 2248 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2249 save_native_result(masm, ret_type, stack_slots); 2250 } 2251 2252 if (LockingMode == LM_MONITOR) { 2253 __ jmp(slow_path_unlock); 2254 } else if (LockingMode == LM_LEGACY) { 2255 // get address of the stack lock 2256 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2257 // get old displaced header 2258 __ movptr(old_hdr, Address(rax, 0)); 2259 2260 // Atomic swap old header if oop still contains the stack lock 2261 __ lock(); 2262 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2263 __ jcc(Assembler::notEqual, slow_path_unlock); 2264 } else { 2265 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2266 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2267 } 2268 2269 // slow path re-enters here 2270 __ bind(unlock_done); 2271 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2272 restore_native_result(masm, ret_type, stack_slots); 2273 } 2274 2275 __ bind(done); 2276 2277 } 2278 { 2279 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2280 save_native_result(masm, ret_type, stack_slots); 2281 __ mov_metadata(c_rarg1, method()); 2282 __ call_VM_leaf( 2283 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2284 r15_thread, c_rarg1); 2285 restore_native_result(masm, ret_type, stack_slots); 2286 } 2287 2288 __ reset_last_Java_frame(false); 2289 2290 // Unbox oop result, e.g. JNIHandles::resolve value. 2291 if (is_reference_type(ret_type)) { 2292 __ resolve_jobject(rax /* value */, 2293 r15_thread /* thread */, 2294 rcx /* tmp */); 2295 } 2296 2297 if (CheckJNICalls) { 2298 // clear_pending_jni_exception_check 2299 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2300 } 2301 2302 if (!is_critical_native) { 2303 // reset handle block 2304 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2305 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2306 } 2307 2308 // pop our frame 2309 2310 __ leave(); 2311 2312 if (!is_critical_native) { 2313 // Any exception pending? 2314 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2315 __ jcc(Assembler::notEqual, exception_pending); 2316 } 2317 2318 // Return 2319 2320 __ ret(0); 2321 2322 // Unexpected paths are out of line and go here 2323 2324 if (!is_critical_native) { 2325 // forward the exception 2326 __ bind(exception_pending); 2327 2328 // and forward the exception 2329 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2330 } 2331 2332 // Slow path locking & unlocking 2333 if (method->is_synchronized()) { 2334 2335 // BEGIN Slow path lock 2336 __ bind(slow_path_lock); 2337 2338 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2339 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2340 2341 // protect the args we've loaded 2342 save_args(masm, total_c_args, c_arg, out_regs); 2343 2344 __ mov(c_rarg0, obj_reg); 2345 __ mov(c_rarg1, lock_reg); 2346 __ mov(c_rarg2, r15_thread); 2347 2348 // Not a leaf but we have last_Java_frame setup as we want 2349 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2350 restore_args(masm, total_c_args, c_arg, out_regs); 2351 2352 #ifdef ASSERT 2353 { Label L; 2354 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2355 __ jcc(Assembler::equal, L); 2356 __ stop("no pending exception allowed on exit from monitorenter"); 2357 __ bind(L); 2358 } 2359 #endif 2360 __ jmp(lock_done); 2361 2362 // END Slow path lock 2363 2364 // BEGIN Slow path unlock 2365 __ bind(slow_path_unlock); 2366 2367 // If we haven't already saved the native result we must save it now as xmm registers 2368 // are still exposed. 2369 __ vzeroupper(); 2370 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2371 save_native_result(masm, ret_type, stack_slots); 2372 } 2373 2374 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2375 2376 __ mov(c_rarg0, obj_reg); 2377 __ mov(c_rarg2, r15_thread); 2378 __ mov(r12, rsp); // remember sp 2379 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2380 __ andptr(rsp, -16); // align stack as required by ABI 2381 2382 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2383 // NOTE that obj_reg == rbx currently 2384 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2385 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2386 2387 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2388 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2389 __ mov(rsp, r12); // restore sp 2390 __ reinit_heapbase(); 2391 #ifdef ASSERT 2392 { 2393 Label L; 2394 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2395 __ jcc(Assembler::equal, L); 2396 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2397 __ bind(L); 2398 } 2399 #endif /* ASSERT */ 2400 2401 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2402 2403 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2404 restore_native_result(masm, ret_type, stack_slots); 2405 } 2406 __ jmp(unlock_done); 2407 2408 // END Slow path unlock 2409 2410 } // synchronized 2411 2412 // SLOW PATH Reguard the stack if needed 2413 2414 __ bind(reguard); 2415 __ vzeroupper(); 2416 save_native_result(masm, ret_type, stack_slots); 2417 __ mov(r12, rsp); // remember sp 2418 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2419 __ andptr(rsp, -16); // align stack as required by ABI 2420 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2421 __ mov(rsp, r12); // restore sp 2422 __ reinit_heapbase(); 2423 restore_native_result(masm, ret_type, stack_slots); 2424 // and continue 2425 __ jmp(reguard_done); 2426 2427 2428 2429 __ flush(); 2430 2431 nmethod *nm = nmethod::new_native_nmethod(method, 2432 compile_id, 2433 masm->code(), 2434 vep_offset, 2435 frame_complete, 2436 stack_slots / VMRegImpl::slots_per_word, 2437 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2438 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2439 oop_maps); 2440 2441 return nm; 2442 } 2443 2444 // this function returns the adjust size (in number of words) to a c2i adapter 2445 // activation for use during deoptimization 2446 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2447 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2448 } 2449 2450 2451 uint SharedRuntime::out_preserve_stack_slots() { 2452 return 0; 2453 } 2454 2455 2456 // Number of stack slots between incoming argument block and the start of 2457 // a new frame. The PROLOG must add this many slots to the stack. The 2458 // EPILOG must remove this many slots. amd64 needs two slots for 2459 // return address. 2460 uint SharedRuntime::in_preserve_stack_slots() { 2461 return 4 + 2 * VerifyStackAtCalls; 2462 } 2463 2464 //------------------------------generate_deopt_blob---------------------------- 2465 void SharedRuntime::generate_deopt_blob() { 2466 // Allocate space for the code 2467 ResourceMark rm; 2468 // Setup code generation tools 2469 int pad = 0; 2470 if (UseAVX > 2) { 2471 pad += 1024; 2472 } 2473 #if INCLUDE_JVMCI 2474 if (EnableJVMCI) { 2475 pad += 512; // Increase the buffer size when compiling for JVMCI 2476 } 2477 #endif 2478 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2479 MacroAssembler* masm = new MacroAssembler(&buffer); 2480 int frame_size_in_words; 2481 OopMap* map = NULL; 2482 OopMapSet *oop_maps = new OopMapSet(); 2483 2484 // ------------- 2485 // This code enters when returning to a de-optimized nmethod. A return 2486 // address has been pushed on the the stack, and return values are in 2487 // registers. 2488 // If we are doing a normal deopt then we were called from the patched 2489 // nmethod from the point we returned to the nmethod. So the return 2490 // address on the stack is wrong by NativeCall::instruction_size 2491 // We will adjust the value so it looks like we have the original return 2492 // address on the stack (like when we eagerly deoptimized). 2493 // In the case of an exception pending when deoptimizing, we enter 2494 // with a return address on the stack that points after the call we patched 2495 // into the exception handler. We have the following register state from, 2496 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2497 // rax: exception oop 2498 // rbx: exception handler 2499 // rdx: throwing pc 2500 // So in this case we simply jam rdx into the useless return address and 2501 // the stack looks just like we want. 2502 // 2503 // At this point we need to de-opt. We save the argument return 2504 // registers. We call the first C routine, fetch_unroll_info(). This 2505 // routine captures the return values and returns a structure which 2506 // describes the current frame size and the sizes of all replacement frames. 2507 // The current frame is compiled code and may contain many inlined 2508 // functions, each with their own JVM state. We pop the current frame, then 2509 // push all the new frames. Then we call the C routine unpack_frames() to 2510 // populate these frames. Finally unpack_frames() returns us the new target 2511 // address. Notice that callee-save registers are BLOWN here; they have 2512 // already been captured in the vframeArray at the time the return PC was 2513 // patched. 2514 address start = __ pc(); 2515 Label cont; 2516 2517 // Prolog for non exception case! 
2518
2519 // Save everything in sight.
2520 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2521
2522 // Normal deoptimization. Save exec mode for unpack_frames.
2523 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2524 __ jmp(cont);
2525
2526 int reexecute_offset = __ pc() - start;
2527 #if INCLUDE_JVMCI && !defined(COMPILER1)
2528 if (EnableJVMCI && UseJVMCICompiler) {
2529 // JVMCI does not use this kind of deoptimization
2530 __ should_not_reach_here();
2531 }
2532 #endif
2533
2534 // Reexecute case
2535 // return address is the pc that describes what bci to re-execute at
2536
2537 // No need to update map as each call to save_live_registers will produce identical oopmap
2538 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2539
2540 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2541 __ jmp(cont);
2542
2543 #if INCLUDE_JVMCI
2544 Label after_fetch_unroll_info_call;
2545 int implicit_exception_uncommon_trap_offset = 0;
2546 int uncommon_trap_offset = 0;
2547
2548 if (EnableJVMCI) {
2549 implicit_exception_uncommon_trap_offset = __ pc() - start;
2550
2551 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2552 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2553
2554 uncommon_trap_offset = __ pc() - start;
2555
2556 // Save everything in sight.
2557 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2558 // fetch_unroll_info needs to call last_java_frame()
2559 __ set_last_Java_frame(noreg, noreg, NULL);
2560
2561 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2562 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2563
2564 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2565 __ mov(c_rarg0, r15_thread);
2566 __ movl(c_rarg2, r14); // exec mode
2567 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2568 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2569
2570 __ reset_last_Java_frame(false);
2571
2572 __ jmp(after_fetch_unroll_info_call);
2573 } // EnableJVMCI
2574 #endif // INCLUDE_JVMCI
2575
2576 int exception_offset = __ pc() - start;
2577
2578 // Prolog for exception case
2579
2580 // all registers are dead at this entry point, except for rax, and
2581 // rdx which contain the exception oop and exception pc
2582 // respectively. Set them in TLS and fall thru to the
2583 // unpack_with_exception_in_tls entry point.
2584
2585 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2586 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2587
2588 int exception_in_tls_offset = __ pc() - start;
2589
2590 // new implementation because exception oop is now passed in JavaThread
2591
2592 // Prolog for exception case
2593 // All registers must be preserved because they might be used by LinearScan
2594 // Exception oop and throwing PC are passed in JavaThread
2595 // tos: stack at point of call to method that threw the exception (i.e. only
2596 // args are on the stack, no return address)
2597
2598 // make room on stack for the return address
2599 // It will be patched later with the throwing pc. The correct value is not
2600 // available now because loading it from memory would destroy registers.
2601 __ push(0); 2602 2603 // Save everything in sight. 2604 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2605 2606 // Now it is safe to overwrite any register 2607 2608 // Deopt during an exception. Save exec mode for unpack_frames. 2609 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2610 2611 // load throwing pc from JavaThread and patch it as the return address 2612 // of the current frame. Then clear the field in JavaThread 2613 2614 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2615 __ movptr(Address(rbp, wordSize), rdx); 2616 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2617 2618 #ifdef ASSERT 2619 // verify that there is really an exception oop in JavaThread 2620 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2621 __ verify_oop(rax); 2622 2623 // verify that there is no pending exception 2624 Label no_pending_exception; 2625 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2626 __ testptr(rax, rax); 2627 __ jcc(Assembler::zero, no_pending_exception); 2628 __ stop("must not have pending exception here"); 2629 __ bind(no_pending_exception); 2630 #endif 2631 2632 __ bind(cont); 2633 2634 // Call C code. Need thread and this frame, but NOT official VM entry 2635 // crud. We cannot block on this call, no GC can happen. 2636 // 2637 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2638 2639 // fetch_unroll_info needs to call last_java_frame(). 2640 2641 __ set_last_Java_frame(noreg, noreg, NULL); 2642 #ifdef ASSERT 2643 { Label L; 2644 __ cmpptr(Address(r15_thread, 2645 JavaThread::last_Java_fp_offset()), 2646 (int32_t)0); 2647 __ jcc(Assembler::equal, L); 2648 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2649 __ bind(L); 2650 } 2651 #endif // ASSERT 2652 __ mov(c_rarg0, r15_thread); 2653 __ movl(c_rarg1, r14); // exec_mode 2654 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2655 2656 // Need to have an oopmap that tells fetch_unroll_info where to 2657 // find any register it might need. 2658 oop_maps->add_gc_map(__ pc() - start, map); 2659 2660 __ reset_last_Java_frame(false); 2661 2662 #if INCLUDE_JVMCI 2663 if (EnableJVMCI) { 2664 __ bind(after_fetch_unroll_info_call); 2665 } 2666 #endif 2667 2668 // Load UnrollBlock* into rdi 2669 __ mov(rdi, rax); 2670 2671 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2672 Label noException; 2673 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2674 __ jcc(Assembler::notEqual, noException); 2675 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2676 // QQQ this is useless it was NULL above 2677 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2678 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2679 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2680 2681 __ verify_oop(rax); 2682 2683 // Overwrite the result registers with the exception results. 2684 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2685 // I think this is useless 2686 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2687 2688 __ bind(noException); 2689 2690 // Only register save data is on the stack. 2691 // Now restore the result registers. 
Everything else is either dead
2692 // or captured in the vframeArray.
2693 RegisterSaver::restore_result_registers(masm);
2694
2695 // All of the register save area has been popped off the stack. Only the
2696 // return address remains.
2697
2698 // Pop all the frames we must move/replace.
2699 //
2700 // Frame picture (youngest to oldest)
2701 // 1: self-frame (no frame link)
2702 // 2: deopting frame (no frame link)
2703 // 3: caller of deopting frame (could be compiled/interpreted).
2704 //
2705 // Note: by leaving the return address of self-frame on the stack
2706 // and using the size of frame 2 to adjust the stack
2707 // when we are done the return to frame 3 will still be on the stack.
2708
2709 // Pop deoptimized frame
2710 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2711 __ addptr(rsp, rcx);
2712
2713 // rsp should be pointing at the return address to the caller (3)
2714
2715 // Pick up the initial fp we should save
2716 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2717 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2718
2719 #ifdef ASSERT
2720 // Compilers generate code that bangs the stack by as much as the
2721 // interpreter would need. So this stack banging should never
2722 // trigger a fault. Verify that it does not on non-product builds.
2723 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2724 __ bang_stack_size(rbx, rcx);
2725 #endif
2726
2727 // Load address of array of frame pcs into rcx
2728 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2729
2730 // Trash the old pc
2731 __ addptr(rsp, wordSize);
2732
2733 // Load address of array of frame sizes into rsi
2734 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2735
2736 // Load counter into rdx
2737 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2738
2739 // Now adjust the caller's stack to make up for the extra locals
2740 // but record the original sp so that we can save it in the skeletal interpreter
2741 // frame and the stack walking of interpreter_sender will get the unextended sp
2742 // value and not the "real" sp value.
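// Each pass of the loop below materializes one skeletal interpreter frame:
// push the recorded return pc, enter() to link a new rbp, then extend rsp by
// the recorded frame size minus the pc and rbp words already pushed. The
// remaining interpreter state is filled in later when unpack_frames()
// populates these skeletal frames.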
2743 2744 const Register sender_sp = r8; 2745 2746 __ mov(sender_sp, rsp); 2747 __ movl(rbx, Address(rdi, 2748 Deoptimization::UnrollBlock:: 2749 caller_adjustment_offset_in_bytes())); 2750 __ subptr(rsp, rbx); 2751 2752 // Push interpreter frames in a loop 2753 Label loop; 2754 __ bind(loop); 2755 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2756 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2757 __ pushptr(Address(rcx, 0)); // Save return address 2758 __ enter(); // Save old & set new ebp 2759 __ subptr(rsp, rbx); // Prolog 2760 // This value is corrected by layout_activation_impl 2761 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2762 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2763 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2764 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2765 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2766 __ decrementl(rdx); // Decrement counter 2767 __ jcc(Assembler::notZero, loop); 2768 __ pushptr(Address(rcx, 0)); // Save final return address 2769 2770 // Re-push self-frame 2771 __ enter(); // Save old & set new ebp 2772 2773 // Allocate a full sized register save area. 2774 // Return address and rbp are in place, so we allocate two less words. 2775 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2776 2777 // Restore frame locals after moving the frame 2778 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2779 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2780 2781 // Call C code. Need thread but NOT official VM entry 2782 // crud. We cannot block on this call, no GC can happen. Call should 2783 // restore return values to their stack-slots with the new SP. 2784 // 2785 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2786 2787 // Use rbp because the frames look interpreted now 2788 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2789 // Don't need the precise return PC here, just precise enough to point into this code blob. 2790 address the_pc = __ pc(); 2791 __ set_last_Java_frame(noreg, rbp, the_pc); 2792 2793 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2794 __ mov(c_rarg0, r15_thread); 2795 __ movl(c_rarg1, r14); // second arg: exec_mode 2796 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2797 // Revert SP alignment after call since we're going to do some SP relative addressing below 2798 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2799 2800 // Set an oopmap for the call site 2801 // Use the same PC we used for the last java frame 2802 oop_maps->add_gc_map(the_pc - start, 2803 new OopMap( frame_size_in_words, 0 )); 2804 2805 // Clear fp AND pc 2806 __ reset_last_Java_frame(true); 2807 2808 // Collect return values 2809 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2810 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2811 // I think this is useless (throwing pc?) 2812 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2813 2814 // Pop self-frame. 
2815 __ leave(); // Epilog 2816 2817 // Jump to interpreter 2818 __ ret(0); 2819 2820 // Make sure all code is generated 2821 masm->flush(); 2822 2823 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2824 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2825 #if INCLUDE_JVMCI 2826 if (EnableJVMCI) { 2827 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2828 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2829 } 2830 #endif 2831 } 2832 2833 #ifdef COMPILER2 2834 //------------------------------generate_uncommon_trap_blob-------------------- 2835 void SharedRuntime::generate_uncommon_trap_blob() { 2836 // Allocate space for the code 2837 ResourceMark rm; 2838 // Setup code generation tools 2839 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2840 MacroAssembler* masm = new MacroAssembler(&buffer); 2841 2842 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2843 2844 address start = __ pc(); 2845 2846 if (UseRTMLocking) { 2847 // Abort RTM transaction before possible nmethod deoptimization. 2848 __ xabort(0); 2849 } 2850 2851 // Push self-frame. We get here with a return address on the 2852 // stack, so rsp is 8-byte aligned until we allocate our frame. 2853 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2854 2855 // No callee saved registers. rbp is assumed implicitly saved 2856 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2857 2858 // compiler left unloaded_class_index in j_rarg0 move to where the 2859 // runtime expects it. 2860 __ movl(c_rarg1, j_rarg0); 2861 2862 __ set_last_Java_frame(noreg, noreg, NULL); 2863 2864 // Call C code. Need thread but NOT official VM entry 2865 // crud. We cannot block on this call, no GC can happen. Call should 2866 // capture callee-saved registers as well as return values. 2867 // Thread is in rdi already. 2868 // 2869 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2870 2871 __ mov(c_rarg0, r15_thread); 2872 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2873 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2874 2875 // Set an oopmap for the call site 2876 OopMapSet* oop_maps = new OopMapSet(); 2877 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2878 2879 // location of rbp is known implicitly by the frame sender code 2880 2881 oop_maps->add_gc_map(__ pc() - start, map); 2882 2883 __ reset_last_Java_frame(false); 2884 2885 // Load UnrollBlock* into rdi 2886 __ mov(rdi, rax); 2887 2888 #ifdef ASSERT 2889 { Label L; 2890 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2891 (int32_t)Deoptimization::Unpack_uncommon_trap); 2892 __ jcc(Assembler::equal, L); 2893 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2894 __ bind(L); 2895 } 2896 #endif 2897 2898 // Pop all the frames we must move/replace. 2899 // 2900 // Frame picture (youngest to oldest) 2901 // 1: self-frame (no frame link) 2902 // 2: deopting frame (no frame link) 2903 // 3: caller of deopting frame (could be compiled/interpreted). 2904 2905 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2906 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
2907 2908 // Pop deoptimized frame (int) 2909 __ movl(rcx, Address(rdi, 2910 Deoptimization::UnrollBlock:: 2911 size_of_deoptimized_frame_offset_in_bytes())); 2912 __ addptr(rsp, rcx); 2913 2914 // rsp should be pointing at the return address to the caller (3) 2915 2916 // Pick up the initial fp we should save 2917 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2918 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2919 2920 #ifdef ASSERT 2921 // Compilers generate code that bang the stack by as much as the 2922 // interpreter would need. So this stack banging should never 2923 // trigger a fault. Verify that it does not on non product builds. 2924 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2925 __ bang_stack_size(rbx, rcx); 2926 #endif 2927 2928 // Load address of array of frame pcs into rcx (address*) 2929 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2930 2931 // Trash the return pc 2932 __ addptr(rsp, wordSize); 2933 2934 // Load address of array of frame sizes into rsi (intptr_t*) 2935 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2936 2937 // Counter 2938 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2939 2940 // Now adjust the caller's stack to make up for the extra locals but 2941 // record the original sp so that we can save it in the skeletal 2942 // interpreter frame and the stack walking of interpreter_sender 2943 // will get the unextended sp value and not the "real" sp value. 2944 2945 const Register sender_sp = r8; 2946 2947 __ mov(sender_sp, rsp); 2948 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2949 __ subptr(rsp, rbx); 2950 2951 // Push interpreter frames in a loop 2952 Label loop; 2953 __ bind(loop); 2954 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2955 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2956 __ pushptr(Address(rcx, 0)); // Save return address 2957 __ enter(); // Save old & set new rbp 2958 __ subptr(rsp, rbx); // Prolog 2959 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2960 sender_sp); // Make it walkable 2961 // This value is corrected by layout_activation_impl 2962 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2963 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2964 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2965 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2966 __ decrementl(rdx); // Decrement counter 2967 __ jcc(Assembler::notZero, loop); 2968 __ pushptr(Address(rcx, 0)); // Save final return address 2969 2970 // Re-push self-frame 2971 __ enter(); // Save old & set new rbp 2972 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2973 // Prolog 2974 2975 // Use rbp because the frames look interpreted now 2976 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2977 // Don't need the precise return PC here, just precise enough to point into this code blob. 2978 address the_pc = __ pc(); 2979 __ set_last_Java_frame(noreg, rbp, the_pc); 2980 2981 // Call C code. Need thread but NOT official VM entry 2982 // crud. We cannot block on this call, no GC can happen. 
Call should 2983 // restore return values to their stack-slots with the new SP. 2984 // Thread is in rdi already. 2985 // 2986 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 2987 2988 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 2989 __ mov(c_rarg0, r15_thread); 2990 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 2991 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2992 2993 // Set an oopmap for the call site 2994 // Use the same PC we used for the last java frame 2995 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 2996 2997 // Clear fp AND pc 2998 __ reset_last_Java_frame(true); 2999 3000 // Pop self-frame. 3001 __ leave(); // Epilog 3002 3003 // Jump to interpreter 3004 __ ret(0); 3005 3006 // Make sure all code is generated 3007 masm->flush(); 3008 3009 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 3010 SimpleRuntimeFrame::framesize >> 1); 3011 } 3012 #endif // COMPILER2 3013 3014 //------------------------------generate_handler_blob------ 3015 // 3016 // Generate a special Compile2Runtime blob that saves all registers, 3017 // and setup oopmap. 3018 // 3019 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3020 assert(StubRoutines::forward_exception_entry() != NULL, 3021 "must be generated before"); 3022 3023 ResourceMark rm; 3024 OopMapSet *oop_maps = new OopMapSet(); 3025 OopMap* map; 3026 3027 // Allocate space for the code. Setup code generation tools. 3028 CodeBuffer buffer("handler_blob", 2048, 1024); 3029 MacroAssembler* masm = new MacroAssembler(&buffer); 3030 3031 address start = __ pc(); 3032 address call_pc = NULL; 3033 int frame_size_in_words; 3034 bool cause_return = (poll_type == POLL_AT_RETURN); 3035 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3036 3037 if (UseRTMLocking) { 3038 // Abort RTM transaction before calling runtime 3039 // because critical section will be large and will be 3040 // aborted anyway. Also nmethod could be deoptimized. 3041 __ xabort(0); 3042 } 3043 3044 // Make room for return address (or push it again) 3045 if (!cause_return) { 3046 __ push(rbx); 3047 } 3048 3049 // Save registers, fpu state, and flags 3050 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3051 3052 // The following is basically a call_VM. However, we need the precise 3053 // address of the call in order to generate an oopmap. Hence, we do all the 3054 // work outselves. 3055 3056 __ set_last_Java_frame(noreg, noreg, NULL); 3057 3058 // The return address must always be correct so that frame constructor never 3059 // sees an invalid pc. 3060 3061 if (!cause_return) { 3062 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3063 // Additionally, rbx is a callee saved register and we can look at it later to determine 3064 // if someone changed the return address for us! 3065 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3066 __ movptr(Address(rbp, wordSize), rbx); 3067 } 3068 3069 // Do the call 3070 __ mov(c_rarg0, r15_thread); 3071 __ call(RuntimeAddress(call_ptr)); 3072 3073 // Set an oopmap for the call site. This oopmap will map all 3074 // oop-registers and debug-info registers as callee-saved. This 3075 // will allow deoptimization at this safepoint to find all possible 3076 // debug-info recordings, as well as let GC find all oops. 
3077 3078 oop_maps->add_gc_map( __ pc() - start, map); 3079 3080 Label noException; 3081 3082 __ reset_last_Java_frame(false); 3083 3084 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 3085 __ jcc(Assembler::equal, noException); 3086 3087 // Exception pending 3088 3089 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3090 3091 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3092 3093 // No exception case 3094 __ bind(noException); 3095 3096 Label no_adjust; 3097 #ifdef ASSERT 3098 Label bail; 3099 #endif 3100 if (!cause_return) { 3101 Label no_prefix, not_special; 3102 3103 // If our stashed return pc was modified by the runtime we avoid touching it 3104 __ cmpptr(rbx, Address(rbp, wordSize)); 3105 __ jccb(Assembler::notEqual, no_adjust); 3106 3107 // Skip over the poll instruction. 3108 // See NativeInstruction::is_safepoint_poll() 3109 // Possible encodings: 3110 // 85 00 test %eax,(%rax) 3111 // 85 01 test %eax,(%rcx) 3112 // 85 02 test %eax,(%rdx) 3113 // 85 03 test %eax,(%rbx) 3114 // 85 06 test %eax,(%rsi) 3115 // 85 07 test %eax,(%rdi) 3116 // 3117 // 41 85 00 test %eax,(%r8) 3118 // 41 85 01 test %eax,(%r9) 3119 // 41 85 02 test %eax,(%r10) 3120 // 41 85 03 test %eax,(%r11) 3121 // 41 85 06 test %eax,(%r14) 3122 // 41 85 07 test %eax,(%r15) 3123 // 3124 // 85 04 24 test %eax,(%rsp) 3125 // 41 85 04 24 test %eax,(%r12) 3126 // 85 45 00 test %eax,0x0(%rbp) 3127 // 41 85 45 00 test %eax,0x0(%r13) 3128 3129 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3130 __ jcc(Assembler::notEqual, no_prefix); 3131 __ addptr(rbx, 1); 3132 __ bind(no_prefix); 3133 #ifdef ASSERT 3134 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3135 #endif 3136 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3137 // r12/rsp 0x04 3138 // r13/rbp 0x05 3139 __ movzbq(rcx, Address(rbx, 1)); 3140 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3141 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3142 __ cmpptr(rcx, 1); 3143 __ jcc(Assembler::above, not_special); 3144 __ addptr(rbx, 1); 3145 __ bind(not_special); 3146 #ifdef ASSERT 3147 // Verify the correct encoding of the poll we're about to skip. 3148 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3149 __ jcc(Assembler::notEqual, bail); 3150 // Mask out the modrm bits 3151 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3152 // rax encodes to 0, so if the bits are nonzero it's incorrect 3153 __ jcc(Assembler::notZero, bail); 3154 #endif 3155 // Adjust return pc forward to step over the safepoint poll instruction 3156 __ addptr(rbx, 2); 3157 __ movptr(Address(rbp, wordSize), rbx); 3158 } 3159 3160 __ bind(no_adjust); 3161 // Normal exit, restore registers and exit. 3162 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3163 __ ret(0); 3164 3165 #ifdef ASSERT 3166 __ bind(bail); 3167 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3168 #endif 3169 3170 // Make sure all code is generated 3171 masm->flush(); 3172 3173 // Fill-out other meta info 3174 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3175 } 3176 3177 // 3178 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3179 // 3180 // Generate a stub that calls into vm to find out the proper destination 3181 // of a java call. 
All the argument registers are live at this point
3182 // but since this is generic code we don't know what they are and the caller
3183 // must do any gc of the args.
3184 //
3185 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3186   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3187
3188   // allocate space for the code
3189   ResourceMark rm;
3190
3191   CodeBuffer buffer(name, 1000, 512);
3192   MacroAssembler* masm = new MacroAssembler(&buffer);
3193
3194   int frame_size_in_words;
3195
3196   OopMapSet *oop_maps = new OopMapSet();
3197   OopMap* map = NULL;
3198
3199   int start = __ offset();
3200
3201   // No need to save vector registers since they are caller-saved anyway.
3202   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3203
3204   int frame_complete = __ offset();
3205
3206   __ set_last_Java_frame(noreg, noreg, NULL);
3207
3208   __ mov(c_rarg0, r15_thread);
3209
3210   __ call(RuntimeAddress(destination));
3211
3212
3213   // Set an oopmap for the call site.
3214   // We need this not only for callee-saved registers, but also for volatile
3215   // registers that the compiler might be keeping live across a safepoint.
3216
3217   oop_maps->add_gc_map( __ offset() - start, map);
3218
3219   // rax contains the address we are going to jump to, assuming no exception got installed
3220
3221   // clear last_Java_sp
3222   __ reset_last_Java_frame(false);
3223   // check for pending exceptions
3224   Label pending;
3225   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3226   __ jcc(Assembler::notEqual, pending);
3227
3228   // get the returned Method*
3229   __ get_vm_result_2(rbx, r15_thread);
3230   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3231
3232   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3233
3234   RegisterSaver::restore_live_registers(masm);
3235
3236   // We are back to the original state on entry and ready to go.
3237
3238   __ jmp(rax);
3239
3240   // Pending exception after the safepoint
3241
3242   __ bind(pending);
3243
3244   RegisterSaver::restore_live_registers(masm);
3245
3246   // exception pending => remove activation and forward to exception handler
3247
3248   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3249
3250   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3251   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3252
3253   // -------------
3254   // make sure all code is generated
3255   masm->flush();
3256
3257   // return the blob
3258   // frame_size_words or bytes??
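  // (frame_size_in_words is what gets passed below, so the size here is expressed in words.)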
3259 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3260 } 3261 3262 #ifdef COMPILER2 3263 static const int native_invoker_code_size = MethodHandles::adapter_code_size; 3264 3265 class NativeInvokerGenerator : public StubCodeGenerator { 3266 address _call_target; 3267 int _shadow_space_bytes; 3268 3269 const GrowableArray<VMReg>& _input_registers; 3270 const GrowableArray<VMReg>& _output_registers; 3271 3272 int _frame_complete; 3273 int _framesize; 3274 OopMapSet* _oop_maps; 3275 public: 3276 NativeInvokerGenerator(CodeBuffer* buffer, 3277 address call_target, 3278 int shadow_space_bytes, 3279 const GrowableArray<VMReg>& input_registers, 3280 const GrowableArray<VMReg>& output_registers) 3281 : StubCodeGenerator(buffer, PrintMethodHandleStubs), 3282 _call_target(call_target), 3283 _shadow_space_bytes(shadow_space_bytes), 3284 _input_registers(input_registers), 3285 _output_registers(output_registers), 3286 _frame_complete(0), 3287 _framesize(0), 3288 _oop_maps(NULL) { 3289 assert(_output_registers.length() <= 1 3290 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns"); 3291 3292 } 3293 3294 void generate(); 3295 3296 int spill_size_in_bytes() const { 3297 if (_output_registers.length() == 0) { 3298 return 0; 3299 } 3300 VMReg reg = _output_registers.at(0); 3301 assert(reg->is_reg(), "must be a register"); 3302 if (reg->is_Register()) { 3303 return 8; 3304 } else if (reg->is_XMMRegister()) { 3305 if (UseAVX >= 3) { 3306 return 64; 3307 } else if (UseAVX >= 1) { 3308 return 32; 3309 } else { 3310 return 16; 3311 } 3312 } else { 3313 ShouldNotReachHere(); 3314 } 3315 return 0; 3316 } 3317 3318 void spill_out_registers() { 3319 if (_output_registers.length() == 0) { 3320 return; 3321 } 3322 VMReg reg = _output_registers.at(0); 3323 assert(reg->is_reg(), "must be a register"); 3324 MacroAssembler* masm = _masm; 3325 if (reg->is_Register()) { 3326 __ movptr(Address(rsp, 0), reg->as_Register()); 3327 } else if (reg->is_XMMRegister()) { 3328 if (UseAVX >= 3) { 3329 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit); 3330 } else if (UseAVX >= 1) { 3331 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister()); 3332 } else { 3333 __ movdqu(Address(rsp, 0), reg->as_XMMRegister()); 3334 } 3335 } else { 3336 ShouldNotReachHere(); 3337 } 3338 } 3339 3340 void fill_out_registers() { 3341 if (_output_registers.length() == 0) { 3342 return; 3343 } 3344 VMReg reg = _output_registers.at(0); 3345 assert(reg->is_reg(), "must be a register"); 3346 MacroAssembler* masm = _masm; 3347 if (reg->is_Register()) { 3348 __ movptr(reg->as_Register(), Address(rsp, 0)); 3349 } else if (reg->is_XMMRegister()) { 3350 if (UseAVX >= 3) { 3351 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit); 3352 } else if (UseAVX >= 1) { 3353 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3354 } else { 3355 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3356 } 3357 } else { 3358 ShouldNotReachHere(); 3359 } 3360 } 3361 3362 int frame_complete() const { 3363 return _frame_complete; 3364 } 3365 3366 int framesize() const { 3367 return (_framesize >> (LogBytesPerWord - LogBytesPerInt)); 3368 } 3369 3370 OopMapSet* oop_maps() const { 3371 return _oop_maps; 3372 } 3373 3374 private: 3375 #ifdef ASSERT 3376 bool target_uses_register(VMReg reg) { 3377 return _input_registers.contains(reg) || _output_registers.contains(reg); 3378 } 3379 #endif 3380 }; 3381 3382 RuntimeStub* 
SharedRuntime::make_native_invoker(address call_target, 3383 int shadow_space_bytes, 3384 const GrowableArray<VMReg>& input_registers, 3385 const GrowableArray<VMReg>& output_registers) { 3386 int locs_size = 64; 3387 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size); 3388 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers); 3389 g.generate(); 3390 code.log_section_sizes("nep_invoker_blob"); 3391 3392 RuntimeStub* stub = 3393 RuntimeStub::new_runtime_stub("nep_invoker_blob", 3394 &code, 3395 g.frame_complete(), 3396 g.framesize(), 3397 g.oop_maps(), false); 3398 return stub; 3399 } 3400 3401 void NativeInvokerGenerator::generate() { 3402 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict"); 3403 3404 enum layout { 3405 rbp_off, 3406 rbp_off2, 3407 return_off, 3408 return_off2, 3409 framesize // inclusive of return address 3410 }; 3411 3412 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4); 3413 assert(is_even(_framesize/2), "sp not 16-byte aligned"); 3414 3415 _oop_maps = new OopMapSet(); 3416 MacroAssembler* masm = _masm; 3417 3418 address start = __ pc(); 3419 3420 __ enter(); 3421 3422 // return address and rbp are already in place 3423 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog 3424 3425 _frame_complete = __ pc() - start; 3426 3427 address the_pc = __ pc(); 3428 3429 __ set_last_Java_frame(rsp, rbp, (address)the_pc); 3430 OopMap* map = new OopMap(_framesize, 0); 3431 _oop_maps->add_gc_map(the_pc - start, map); 3432 3433 // State transition 3434 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 3435 3436 __ call(RuntimeAddress(_call_target)); 3437 3438 __ restore_cpu_control_state_after_jni(); 3439 3440 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 3441 3442 // Force this write out before the read below 3443 __ membar(Assembler::Membar_mask_bits( 3444 Assembler::LoadLoad | Assembler::LoadStore | 3445 Assembler::StoreLoad | Assembler::StoreStore)); 3446 3447 Label L_after_safepoint_poll; 3448 Label L_safepoint_poll_slow_path; 3449 3450 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 3451 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 3452 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path); 3453 3454 __ bind(L_after_safepoint_poll); 3455 3456 // change thread state 3457 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 3458 3459 __ block_comment("reguard stack check"); 3460 Label L_reguard; 3461 Label L_after_reguard; 3462 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 3463 __ jcc(Assembler::equal, L_reguard); 3464 __ bind(L_after_reguard); 3465 3466 __ reset_last_Java_frame(r15_thread, true); 3467 3468 __ leave(); // required for proper stackwalking of RuntimeStub frame 3469 __ ret(0); 3470 3471 ////////////////////////////////////////////////////////////////////////////// 3472 3473 __ block_comment("{ L_safepoint_poll_slow_path"); 3474 __ bind(L_safepoint_poll_slow_path); 3475 __ vzeroupper(); 3476 3477 spill_out_registers(); 3478 3479 __ mov(c_rarg0, r15_thread); 3480 __ mov(r12, rsp); // remember sp 3481 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3482 __ andptr(rsp, -16); // align stack as 
required by ABI 3483 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 3484 __ mov(rsp, r12); // restore sp 3485 __ reinit_heapbase(); 3486 3487 fill_out_registers(); 3488 3489 __ jmp(L_after_safepoint_poll); 3490 __ block_comment("} L_safepoint_poll_slow_path"); 3491 3492 ////////////////////////////////////////////////////////////////////////////// 3493 3494 __ block_comment("{ L_reguard"); 3495 __ bind(L_reguard); 3496 __ vzeroupper(); 3497 3498 spill_out_registers(); 3499 3500 __ mov(r12, rsp); // remember sp 3501 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3502 __ andptr(rsp, -16); // align stack as required by ABI 3503 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 3504 __ mov(rsp, r12); // restore sp 3505 __ reinit_heapbase(); 3506 3507 fill_out_registers(); 3508 3509 __ jmp(L_after_reguard); 3510 3511 __ block_comment("} L_reguard"); 3512 3513 ////////////////////////////////////////////////////////////////////////////// 3514 3515 __ flush(); 3516 } 3517 #endif // COMPILER2 3518 3519 //------------------------------Montgomery multiplication------------------------ 3520 // 3521 3522 #ifndef _WINDOWS 3523 3524 // Subtract 0:b from carry:a. Return carry. 3525 static julong 3526 sub(julong a[], julong b[], julong carry, long len) { 3527 long long i = 0, cnt = len; 3528 julong tmp; 3529 asm volatile("clc; " 3530 "0: ; " 3531 "mov (%[b], %[i], 8), %[tmp]; " 3532 "sbb %[tmp], (%[a], %[i], 8); " 3533 "inc %[i]; dec %[cnt]; " 3534 "jne 0b; " 3535 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3536 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3537 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3538 : "memory"); 3539 return tmp; 3540 } 3541 3542 // Multiply (unsigned) Long A by Long B, accumulating the double- 3543 // length result into the accumulator formed of T0, T1, and T2. 3544 #define MACC(A, B, T0, T1, T2) \ 3545 do { \ 3546 unsigned long hi, lo; \ 3547 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3548 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3549 : "r"(A), "a"(B) : "cc"); \ 3550 } while(0) 3551 3552 // As above, but add twice the double-length result into the 3553 // accumulator. 3554 #define MACC2(A, B, T0, T1, T2) \ 3555 do { \ 3556 unsigned long hi, lo; \ 3557 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3558 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3559 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3560 : "r"(A), "a"(B) : "cc"); \ 3561 } while(0) 3562 3563 #else //_WINDOWS 3564 3565 static julong 3566 sub(julong a[], julong b[], julong carry, long len) { 3567 long i; 3568 julong tmp; 3569 unsigned char c = 1; 3570 for (i = 0; i < len; i++) { 3571 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3572 a[i] = tmp; 3573 } 3574 c = _addcarry_u64(c, carry, ~0, &tmp); 3575 return tmp; 3576 } 3577 3578 // Multiply (unsigned) Long A by Long B, accumulating the double- 3579 // length result into the accumulator formed of T0, T1, and T2. 3580 #define MACC(A, B, T0, T1, T2) \ 3581 do { \ 3582 julong hi, lo; \ 3583 lo = _umul128(A, B, &hi); \ 3584 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3585 c = _addcarry_u64(c, hi, T1, &T1); \ 3586 _addcarry_u64(c, T2, 0, &T2); \ 3587 } while(0) 3588 3589 // As above, but add twice the double-length result into the 3590 // accumulator. 
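// That is, (T2:T1:T0) += 2 * A * B; the same carry chain is simply applied twice.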
3591 #define MACC2(A, B, T0, T1, T2) \ 3592 do { \ 3593 julong hi, lo; \ 3594 lo = _umul128(A, B, &hi); \ 3595 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3596 c = _addcarry_u64(c, hi, T1, &T1); \ 3597 _addcarry_u64(c, T2, 0, &T2); \ 3598 c = _addcarry_u64(0, lo, T0, &T0); \ 3599 c = _addcarry_u64(c, hi, T1, &T1); \ 3600 _addcarry_u64(c, T2, 0, &T2); \ 3601 } while(0) 3602 3603 #endif //_WINDOWS 3604 3605 // Fast Montgomery multiplication. The derivation of the algorithm is 3606 // in A Cryptographic Library for the Motorola DSP56000, 3607 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3608 3609 static void NOINLINE 3610 montgomery_multiply(julong a[], julong b[], julong n[], 3611 julong m[], julong inv, int len) { 3612 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3613 int i; 3614 3615 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3616 3617 for (i = 0; i < len; i++) { 3618 int j; 3619 for (j = 0; j < i; j++) { 3620 MACC(a[j], b[i-j], t0, t1, t2); 3621 MACC(m[j], n[i-j], t0, t1, t2); 3622 } 3623 MACC(a[i], b[0], t0, t1, t2); 3624 m[i] = t0 * inv; 3625 MACC(m[i], n[0], t0, t1, t2); 3626 3627 assert(t0 == 0, "broken Montgomery multiply"); 3628 3629 t0 = t1; t1 = t2; t2 = 0; 3630 } 3631 3632 for (i = len; i < 2*len; i++) { 3633 int j; 3634 for (j = i-len+1; j < len; j++) { 3635 MACC(a[j], b[i-j], t0, t1, t2); 3636 MACC(m[j], n[i-j], t0, t1, t2); 3637 } 3638 m[i-len] = t0; 3639 t0 = t1; t1 = t2; t2 = 0; 3640 } 3641 3642 while (t0) 3643 t0 = sub(m, n, t0, len); 3644 } 3645 3646 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3647 // multiplies so it should be up to 25% faster than Montgomery 3648 // multiplication. However, its loop control is more complex and it 3649 // may actually run slower on some machines. 3650 3651 static void NOINLINE 3652 montgomery_square(julong a[], julong n[], 3653 julong m[], julong inv, int len) { 3654 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3655 int i; 3656 3657 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3658 3659 for (i = 0; i < len; i++) { 3660 int j; 3661 int end = (i+1)/2; 3662 for (j = 0; j < end; j++) { 3663 MACC2(a[j], a[i-j], t0, t1, t2); 3664 MACC(m[j], n[i-j], t0, t1, t2); 3665 } 3666 if ((i & 1) == 0) { 3667 MACC(a[j], a[j], t0, t1, t2); 3668 } 3669 for (; j < i; j++) { 3670 MACC(m[j], n[i-j], t0, t1, t2); 3671 } 3672 m[i] = t0 * inv; 3673 MACC(m[i], n[0], t0, t1, t2); 3674 3675 assert(t0 == 0, "broken Montgomery square"); 3676 3677 t0 = t1; t1 = t2; t2 = 0; 3678 } 3679 3680 for (i = len; i < 2*len; i++) { 3681 int start = i-len+1; 3682 int end = start + (len - start)/2; 3683 int j; 3684 for (j = start; j < end; j++) { 3685 MACC2(a[j], a[i-j], t0, t1, t2); 3686 MACC(m[j], n[i-j], t0, t1, t2); 3687 } 3688 if ((i & 1) == 0) { 3689 MACC(a[j], a[j], t0, t1, t2); 3690 } 3691 for (; j < len; j++) { 3692 MACC(m[j], n[i-j], t0, t1, t2); 3693 } 3694 m[i-len] = t0; 3695 t0 = t1; t1 = t2; t2 = 0; 3696 } 3697 3698 while (t0) 3699 t0 = sub(m, n, t0, len); 3700 } 3701 3702 // Swap words in a longword. 3703 static julong swap(julong x) { 3704 return (x << 32) | (x >> 32); 3705 } 3706 3707 // Copy len longwords from s to d, word-swapping as we go. The 3708 // destination array is reversed. 
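// For example, with len == 2:
//   s = { 0x1111222233334444, 0x5555666677778888 }
// becomes
//   d = { 0x7777888855556666, 0x3333444411112222 }.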
3709 static void reverse_words(julong *s, julong *d, int len) {
3710   d += len;
3711   while(len-- > 0) {
3712     d--;
3713     *d = swap(*s);
3714     s++;
3715   }
3716 }
3717
3718 // The threshold at which squaring is advantageous was determined
3719 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3720 #define MONTGOMERY_SQUARING_THRESHOLD 64
3721
3722 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3723                                         jint len, jlong inv,
3724                                         jint *m_ints) {
3725   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3726   int longwords = len/2;
3727
3728   // Make very sure we don't use so much space that the stack might
3729   // overflow. 512 jints corresponds to a 16384-bit integer and
3730   // will use a total of 8k bytes of stack space here.
3731   int divisor = sizeof(julong) * 4;
3732   guarantee(longwords <= 8192 / divisor, "must be");
3733   int total_allocation = longwords * sizeof (julong) * 4;
3734   julong *scratch = (julong *)alloca(total_allocation);
3735
3736   // Local scratch arrays
3737   julong
3738     *a = scratch + 0 * longwords,
3739     *b = scratch + 1 * longwords,
3740     *n = scratch + 2 * longwords,
3741     *m = scratch + 3 * longwords;
3742
3743   reverse_words((julong *)a_ints, a, longwords);
3744   reverse_words((julong *)b_ints, b, longwords);
3745   reverse_words((julong *)n_ints, n, longwords);
3746
3747   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3748
3749   reverse_words(m, (julong *)m_ints, longwords);
3750 }
3751
3752 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3753                                       jint len, jlong inv,
3754                                       jint *m_ints) {
3755   assert(len % 2 == 0, "array length in montgomery_square must be even");
3756   int longwords = len/2;
3757
3758   // Make very sure we don't use so much space that the stack might
3759   // overflow. 512 jints corresponds to a 16384-bit integer and
3760   // will use a total of 6k bytes of stack space here.
3761   int divisor = sizeof(julong) * 3;
3762   guarantee(longwords <= (8192 / divisor), "must be");
3763   int total_allocation = longwords * sizeof (julong) * 3;
3764   julong *scratch = (julong *)alloca(total_allocation);
3765
3766   // Local scratch arrays
3767   julong
3768     *a = scratch + 0 * longwords,
3769     *n = scratch + 1 * longwords,
3770     *m = scratch + 2 * longwords;
3771
3772   reverse_words((julong *)a_ints, a, longwords);
3773   reverse_words((julong *)n_ints, n, longwords);
3774
3775   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3776     ::montgomery_square(a, n, m, (julong)inv, longwords);
3777   } else {
3778     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3779   }
3780
3781   reverse_words(m, (julong *)m_ints, longwords);
3782 }
3783
3784 #ifdef COMPILER2
3785 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3786 //
3787 //------------------------------generate_exception_blob---------------------------
3788 // Creates the exception blob at the end.
3789 // Compiled code jumps to this blob from its exception handler
3790 // (see emit_exception_handler in the x86_64.ad file).
3791 //
3792 // Given an exception pc at a call we call into the runtime for the
3793 // handler in this method. This handler might merely restore state
3794 // (i.e. callee-save registers), unwind the frame, and jump to the
3795 // exception handler for the nmethod if there is no Java-level handler
3796 // for the nmethod.
3797 //
3798 // This code is entered with a jmp.
3799 // 3800 // Arguments: 3801 // rax: exception oop 3802 // rdx: exception pc 3803 // 3804 // Results: 3805 // rax: exception oop 3806 // rdx: exception pc in caller or ??? 3807 // destination: exception handler of caller 3808 // 3809 // Note: the exception pc MUST be at a call (precise debug information) 3810 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved. 3811 // 3812 3813 void OptoRuntime::generate_exception_blob() { 3814 assert(!OptoRuntime::is_callee_saved_register(RDX_num), ""); 3815 assert(!OptoRuntime::is_callee_saved_register(RAX_num), ""); 3816 assert(!OptoRuntime::is_callee_saved_register(RCX_num), ""); 3817 3818 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 3819 3820 // Allocate space for the code 3821 ResourceMark rm; 3822 // Setup code generation tools 3823 CodeBuffer buffer("exception_blob", 2048, 1024); 3824 MacroAssembler* masm = new MacroAssembler(&buffer); 3825 3826 3827 address start = __ pc(); 3828 3829 // Exception pc is 'return address' for stack walker 3830 __ push(rdx); 3831 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog 3832 3833 // Save callee-saved registers. See x86_64.ad. 3834 3835 // rbp is an implicitly saved callee saved register (i.e., the calling 3836 // convention will save/restore it in the prolog/epilog). Other than that 3837 // there are no callee save registers now that adapter frames are gone. 3838 3839 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 3840 3841 // Store exception in Thread object. We cannot pass any arguments to the 3842 // handle_exception call, since we do not want to make any assumption 3843 // about the size of the frame where the exception happened in. 3844 // c_rarg0 is either rdi (Linux) or rcx (Windows). 3845 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax); 3846 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 3847 3848 // This call does all the hard work. It checks if an exception handler 3849 // exists in the method. 3850 // If so, it returns the handler address. 3851 // If not, it prepares for stack-unwinding, restoring the callee-save 3852 // registers of the frame being removed. 3853 // 3854 // address OptoRuntime::handle_exception_C(JavaThread* thread) 3855 3856 // At a method handle call, the stack may not be properly aligned 3857 // when returning with an exception. 3858 address the_pc = __ pc(); 3859 __ set_last_Java_frame(noreg, noreg, the_pc); 3860 __ mov(c_rarg0, r15_thread); 3861 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3862 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C))); 3863 3864 // Set an oopmap for the call site. This oopmap will only be used if we 3865 // are unwinding the stack. Hence, all locations will be dead. 3866 // Callee-saved registers will be the same as the frame above (i.e., 3867 // handle_exception_stub), since they were restored when we got the 3868 // exception. 3869 3870 OopMapSet* oop_maps = new OopMapSet(); 3871 3872 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3873 3874 __ reset_last_Java_frame(false); 3875 3876 // Restore callee-saved registers 3877 3878 // rbp is an implicitly saved callee-saved register (i.e., the calling 3879 // convention will save restore it in prolog/epilog) Other than that 3880 // there are no callee save registers now that adapter frames are gone. 
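  // Reload rbp from the SimpleRuntimeFrame slot where it was stashed on entry.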
3881 3882 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3883 3884 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3885 __ pop(rdx); // No need for exception pc anymore 3886 3887 // rax: exception handler 3888 3889 // We have a handler in rax (could be deopt blob). 3890 __ mov(r8, rax); 3891 3892 // Get the exception oop 3893 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3894 // Get the exception pc in case we are deoptimized 3895 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3896 #ifdef ASSERT 3897 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD); 3898 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD); 3899 #endif 3900 // Clear the exception oop so GC no longer processes it as a root. 3901 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD); 3902 3903 // rax: exception oop 3904 // r8: exception handler 3905 // rdx: exception pc 3906 // Jump to handler 3907 3908 __ jmp(r8); 3909 3910 // Make sure all code is generated 3911 masm->flush(); 3912 3913 // Set exception blob 3914 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3915 } 3916 #endif // COMPILER2 3917 3918 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt, 3919 int total_in_args, const VMRegPair* in_regs, 3920 int total_out_args, VMRegPair* out_regs, 3921 GrowableArray<int>& arg_order, 3922 VMRegPair tmp_vmreg) { 3923 ComputeMoveOrder order(total_in_args, in_regs, 3924 total_out_args, out_regs, 3925 in_sig_bt, arg_order, tmp_vmreg); 3926 }
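// Illustrative sketch (guarded out of the build with #if 0): a portable
// reference model of the triple-precision accumulation that the MACC and
// MACC2 macros above implement. The helper names (macc_reference,
// macc2_reference) are hypothetical and exist only for this example; the
// unsigned __int128 type assumes a GCC/Clang-style compiler.
#if 0
#include <cstdint>

// (T2:T1:T0) += a * b, mirroring MACC's add/adc carry chain.
static void macc_reference(uint64_t a, uint64_t b,
                           uint64_t& t0, uint64_t& t1, uint64_t& t2) {
  unsigned __int128 prod = (unsigned __int128)a * b;
  unsigned __int128 sum0 = (unsigned __int128)t0 + (uint64_t)prod;          // low word, carry out
  unsigned __int128 sum1 = (unsigned __int128)t1 + (uint64_t)(prod >> 64)
                           + (uint64_t)(sum0 >> 64);                        // high word, carry in/out
  t0 = (uint64_t)sum0;
  t1 = (uint64_t)sum1;
  t2 += (uint64_t)(sum1 >> 64);                                             // final carry
}

// (T2:T1:T0) += 2 * a * b, mirroring MACC2 (the same chain applied twice).
static void macc2_reference(uint64_t a, uint64_t b,
                            uint64_t& t0, uint64_t& t1, uint64_t& t2) {
  macc_reference(a, b, t0, t1, t2);
  macc_reference(a, b, t0, t1, t2);
}
#endif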