/*
 * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
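  //
  // Rough sketch of the save area described by the layout enum below
  // (orientation only; the enum and push_CPU_state() are authoritative).
  // Offsets grow from rsp towards the caller:
  //
  //   [ arg_reg_save_area (frame::arg_reg_save_area_bytes) ]
  //   [ fxsave/xsave image: XMM/YMM/opmask/ZMM slots       ]
  //   [ GP registers r15 .. rax, align word, flags         ]
  //   [ saved rbp                                          ]
  //   [ return address                                     ]
  //
  // The XSAVE_AREA_* constants mirror the component offsets of the
  // processor's standard XSAVE image (legacy XMM state at byte 160, YMM
  // high halves at 576, opmask at 1088, ZMM high halves at 1152, the
  // upper bank zmm16..31 at 1664).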
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.
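  //
  // Illustrative frame shape after the steps below (assumed from the
  // behavior of enter()/push_CPU_state(); the code is authoritative):
  //
  //   [ return address ]   <- already pushed by the caller
  //   [ saved rbp      ]   <- enter(); rbp points here
  //   [ flags, GPRs,   ]
  //   [ fxsave image   ]   <- push_CPU_state()
  //   [ arg reg save   ]   <- optional, frame::arg_reg_save_area_bytes
  //                        <- rsp (16-byte aligned)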

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
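  //
  // For example (descriptive only), the entry recorded below for rax says
  // that rax's saved value lives in stack slot rax_off of this frame, i.e.
  // at rsp + rax_off * BytesPerInt while the frame is active; the
  // set_callee_saved() calls are the authoritative mapping.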

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX-enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX-enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ),
                          r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?
                       Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
// (up to RegisterImpl::number_of_registers) are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all.  Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
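  //
  // Worked example (illustrative only, assuming the usual j_rarg*/j_farg*
  // assignments below): for a Java signature (long, int, double), sig_bt is
  // { T_LONG, T_VOID, T_INT, T_DOUBLE, T_VOID } and the loop assigns
  //   T_LONG   -> j_rarg0 (register pair),  T_VOID -> bad,
  //   T_INT    -> j_rarg1 (single slot),    T_DOUBLE -> j_farg0 (pair),
  // with no stack slots used, so the function returns 0.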
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.  Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot.  In this case the
    // slot that is occupied is the T_VOID slot.  See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory; use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry.  We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
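    // For example (illustrative arithmetic only): comp_args_on_stack == 3
    // gives 12 bytes, rounded up to 2 words here and kept at 2 words by the
    // alignment below, so 16 bytes are reserved for outgoing stack args.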
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // sees it just as it would after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
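  //
  // A brief map of the entry points generated below (descriptive only):
  //   c2i_unverified_entry      - inline-cache check, then falls into the verified path
  //   c2i_entry                 - verified entry, optional class-init barrier first
  //   c2i_no_clinit_check_entry - verified entry that bypasses the clinit check
  // All of them eventually run gen_c2i_adapter(), which repacks the args into
  // the interpreter layout.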

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
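  //
  // Worked example (illustrative only): for an out signature of
  // (JNIEnv*, jclass, jlong, jdouble), i.e. { T_ADDRESS, T_OBJECT, T_LONG,
  // T_VOID, T_DOUBLE, T_VOID }, the System V path below hands out
  // c_rarg0, c_rarg1, c_rarg2 and c_farg0, while the Win64 path hands out
  // c_rarg0, c_rarg1, c_rarg2 and c_farg3, because Win64 argument slots
  // are positional across the integer and float register banks.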

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
  }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
  }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
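// For example (descriptive only), a critical-native jintArray argument is
// rewritten by the wrapper into a (jint length, jint* body) pair using this
// helper; a NULL array produces (0, NULL).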
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
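//
// Small example of the cycle problem this class solves (illustrative):
// if the shuffle needs rdi -> rsi and rsi -> rdi, neither store can be
// issued first without clobbering the other's source.  The order computed
// below parks one value in a temp first, e.g.
//   rsi -> tmp, rdi -> rsi, tmp -> rdi
// which is what break_cycle()/get_store_order() arrange.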
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair        _src;
    VMRegPair        _dst;
    int              _src_index;
    int              _dst_index;
    bool             _processed;
    MoveOperation*   _next;
    MoveOperation*   _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
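    // For example (illustrative), with moves a->b and b->c, link() chains
    // b->c in front of a->b, so the walk below emits b->c first and a->b
    // second: the store that reads a location is always emitted before the
    // store that overwrites it.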
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
1515 fatal("receiver always in a register"); 1516 receiver_reg = j_rarg0; // known to be free at this point 1517 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1518 } else { 1519 // no data motion is needed 1520 receiver_reg = r->as_Register(); 1521 } 1522 } 1523 1524 // Figure out which address we are really jumping to: 1525 MethodHandles::generate_method_handle_dispatch(masm, iid, 1526 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1527 } 1528 1529 // --------------------------------------------------------------------------- 1530 // Generate a native wrapper for a given method. The method takes arguments 1531 // in the Java compiled code convention, marshals them to the native 1532 // convention (handlizes oops, etc), transitions to native, makes the call, 1533 // returns to java state (possibly blocking), unhandlizes any result and 1534 // returns. 1535 // 1536 // Critical native functions are a shorthand for the use of 1537 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1538 // functions. The wrapper is expected to unpack the arguments before 1539 // passing them to the callee. Critical native functions leave the state _in_Java, 1540 // since they cannot stop for GC. 1541 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1542 // block and the check for pending exceptions it's impossible for them 1543 // to be thrown. 1544 // 1545 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1546 const methodHandle& method, 1547 int compile_id, 1548 BasicType* in_sig_bt, 1549 VMRegPair* in_regs, 1550 BasicType ret_type, 1551 address critical_entry) { 1552 if (method->is_method_handle_intrinsic()) { 1553 vmIntrinsics::ID iid = method->intrinsic_id(); 1554 intptr_t start = (intptr_t)__ pc(); 1555 int vep_offset = ((intptr_t)__ pc()) - start; 1556 gen_special_dispatch(masm, 1557 method, 1558 in_sig_bt, 1559 in_regs); 1560 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1561 __ flush(); 1562 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1563 return nmethod::new_native_nmethod(method, 1564 compile_id, 1565 masm->code(), 1566 vep_offset, 1567 frame_complete, 1568 stack_slots / VMRegImpl::slots_per_word, 1569 in_ByteSize(-1), 1570 in_ByteSize(-1), 1571 (OopMapSet*)NULL); 1572 } 1573 bool is_critical_native = true; 1574 address native_func = critical_entry; 1575 if (native_func == NULL) { 1576 native_func = method->native_function(); 1577 is_critical_native = false; 1578 } 1579 assert(native_func != NULL, "must have function"); 1580 1581 // An OopMap for lock (and class if static) 1582 OopMapSet *oop_maps = new OopMapSet(); 1583 intptr_t start = (intptr_t)__ pc(); 1584 1585 // We have received a description of where all the java arg are located 1586 // on entry to the wrapper. We need to convert these args to where 1587 // the jni function will expect them. 
To figure out where they go 1588 // we convert the java signature to a C signature by inserting 1589 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1590 1591 const int total_in_args = method->size_of_parameters(); 1592 int total_c_args = total_in_args; 1593 if (!is_critical_native) { 1594 total_c_args += 1; 1595 if (method->is_static()) { 1596 total_c_args++; 1597 } 1598 } else { 1599 for (int i = 0; i < total_in_args; i++) { 1600 if (in_sig_bt[i] == T_ARRAY) { 1601 total_c_args++; 1602 } 1603 } 1604 } 1605 1606 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1607 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1608 BasicType* in_elem_bt = NULL; 1609 1610 int argc = 0; 1611 if (!is_critical_native) { 1612 out_sig_bt[argc++] = T_ADDRESS; 1613 if (method->is_static()) { 1614 out_sig_bt[argc++] = T_OBJECT; 1615 } 1616 1617 for (int i = 0; i < total_in_args ; i++ ) { 1618 out_sig_bt[argc++] = in_sig_bt[i]; 1619 } 1620 } else { 1621 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); 1622 SignatureStream ss(method->signature()); 1623 for (int i = 0; i < total_in_args ; i++ ) { 1624 if (in_sig_bt[i] == T_ARRAY) { 1625 // Arrays are passed as int, elem* pair 1626 out_sig_bt[argc++] = T_INT; 1627 out_sig_bt[argc++] = T_ADDRESS; 1628 ss.skip_array_prefix(1); // skip one '[' 1629 assert(ss.is_primitive(), "primitive type expected"); 1630 in_elem_bt[i] = ss.type(); 1631 } else { 1632 out_sig_bt[argc++] = in_sig_bt[i]; 1633 in_elem_bt[i] = T_VOID; 1634 } 1635 if (in_sig_bt[i] != T_VOID) { 1636 assert(in_sig_bt[i] == ss.type() || 1637 in_sig_bt[i] == T_ARRAY, "must match"); 1638 ss.next(); 1639 } 1640 } 1641 } 1642 1643 // Now figure out where the args must be stored and how much stack space 1644 // they require. 1645 int out_arg_slots; 1646 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1647 1648 // Compute framesize for the wrapper. We need to handlize all oops in 1649 // incoming registers 1650 1651 // Calculate the total number of stack slots we will need. 1652 1653 // First count the abi requirement plus all of the outgoing args 1654 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1655 1656 // Now the space for the inbound oop handle area 1657 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1658 if (is_critical_native) { 1659 // Critical natives may have to call out so they need a save area 1660 // for register arguments. 
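    // Worked example (illustrative only): a critical native whose Java
    // signature is (int, long, float, byte[]), with every value arriving in a
    // register, counts 2 single slots (the T_INT and the T_FLOAT) and
    // 2 double slots (the T_LONG and the T_ARRAY, a pointer on LP64), so
    // total_save_slots = 2 * 2 + 2 = 6.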
1661 int double_slots = 0; 1662 int single_slots = 0; 1663 for ( int i = 0; i < total_in_args; i++) { 1664 if (in_regs[i].first()->is_Register()) { 1665 const Register reg = in_regs[i].first()->as_Register(); 1666 switch (in_sig_bt[i]) { 1667 case T_BOOLEAN: 1668 case T_BYTE: 1669 case T_SHORT: 1670 case T_CHAR: 1671 case T_INT: single_slots++; break; 1672 case T_ARRAY: // specific to LP64 (7145024) 1673 case T_LONG: double_slots++; break; 1674 default: ShouldNotReachHere(); 1675 } 1676 } else if (in_regs[i].first()->is_XMMRegister()) { 1677 switch (in_sig_bt[i]) { 1678 case T_FLOAT: single_slots++; break; 1679 case T_DOUBLE: double_slots++; break; 1680 default: ShouldNotReachHere(); 1681 } 1682 } else if (in_regs[i].first()->is_FloatRegister()) { 1683 ShouldNotReachHere(); 1684 } 1685 } 1686 total_save_slots = double_slots * 2 + single_slots; 1687 // align the save area 1688 if (double_slots != 0) { 1689 stack_slots = align_up(stack_slots, 2); 1690 } 1691 } 1692 1693 int oop_handle_offset = stack_slots; 1694 stack_slots += total_save_slots; 1695 1696 // Now any space we need for handlizing a klass if static method 1697 1698 int klass_slot_offset = 0; 1699 int klass_offset = -1; 1700 int lock_slot_offset = 0; 1701 bool is_static = false; 1702 1703 if (method->is_static()) { 1704 klass_slot_offset = stack_slots; 1705 stack_slots += VMRegImpl::slots_per_word; 1706 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1707 is_static = true; 1708 } 1709 1710 // Plus a lock if needed 1711 1712 if (method->is_synchronized()) { 1713 lock_slot_offset = stack_slots; 1714 stack_slots += VMRegImpl::slots_per_word; 1715 } 1716 1717 // Now a place (+2) to save return values or temp during shuffling 1718 // + 4 for return address (which we own) and saved rbp 1719 stack_slots += 6; 1720 1721 // Ok The space we have allocated will look like: 1722 // 1723 // 1724 // FP-> | | 1725 // |---------------------| 1726 // | 2 slots for moves | 1727 // |---------------------| 1728 // | lock box (if sync) | 1729 // |---------------------| <- lock_slot_offset 1730 // | klass (if static) | 1731 // |---------------------| <- klass_slot_offset 1732 // | oopHandle area | 1733 // |---------------------| <- oop_handle_offset (6 java arg registers) 1734 // | outbound memory | 1735 // | based arguments | 1736 // | | 1737 // |---------------------| 1738 // | | 1739 // SP-> | out_preserved_slots | 1740 // 1741 // 1742 1743 1744 // Now compute actual number of stack words we need rounding to make 1745 // stack properly aligned. 1746 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1747 1748 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1749 1750 // First thing make an ic check to see if we should even be here 1751 1752 // We are free to use all registers as temps without saving them and 1753 // restoring them except rbp. rbp is the only callee save register 1754 // as far as the interpreter and the compiler(s) are concerned. 
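  // In outline, the inline cache check emitted next is
  //
  //   if (rax /* expected klass from the call site's inline cache */
  //       != receiver->klass())
  //     jump to SharedRuntime::get_ic_miss_stub();
  //   // otherwise fall through to the aligned verified entry point
  //
  // (sketch only; the real sequence below uses load_klass/cmpq/jcc).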
1755 1756 1757 const Register ic_reg = rax; 1758 const Register receiver = j_rarg0; 1759 1760 Label hit; 1761 Label exception_pending; 1762 1763 assert_different_registers(ic_reg, receiver, rscratch1); 1764 __ verify_oop(receiver); 1765 __ load_klass(rscratch1, receiver, rscratch2); 1766 __ cmpq(ic_reg, rscratch1); 1767 __ jcc(Assembler::equal, hit); 1768 1769 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1770 1771 // Verified entry point must be aligned 1772 __ align(8); 1773 1774 __ bind(hit); 1775 1776 int vep_offset = ((intptr_t)__ pc()) - start; 1777 1778 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1779 Label L_skip_barrier; 1780 Register klass = r10; 1781 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1782 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1783 1784 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1785 1786 __ bind(L_skip_barrier); 1787 } 1788 1789 #ifdef COMPILER1 1790 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1791 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1792 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1793 } 1794 #endif // COMPILER1 1795 1796 // The instruction at the verified entry point must be 5 bytes or longer 1797 // because it can be patched on the fly by make_non_entrant. The stack bang 1798 // instruction fits that requirement. 1799 1800 // Generate stack overflow check 1801 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1802 1803 // Generate a new frame for the wrapper. 1804 __ enter(); 1805 // -2 because return address is already present and so is saved rbp 1806 __ subptr(rsp, stack_size - 2*wordSize); 1807 1808 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1809 bs->nmethod_entry_barrier(masm); 1810 1811 // Frame is now completed as far as size and linkage. 1812 int frame_complete = ((intptr_t)__ pc()) - start; 1813 1814 if (UseRTMLocking) { 1815 // Abort RTM transaction before calling JNI 1816 // because critical section will be large and will be 1817 // aborted anyway. Also nmethod could be deoptimized. 1818 __ xabort(0); 1819 } 1820 1821 #ifdef ASSERT 1822 { 1823 Label L; 1824 __ mov(rax, rsp); 1825 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI) 1826 __ cmpptr(rax, rsp); 1827 __ jcc(Assembler::equal, L); 1828 __ stop("improperly aligned stack"); 1829 __ bind(L); 1830 } 1831 #endif /* ASSERT */ 1832 1833 1834 // We use r14 as the oop handle for the receiver/klass 1835 // It is callee save so it survives the call to native 1836 1837 const Register oop_handle_reg = r14; 1838 1839 // 1840 // We immediately shuffle the arguments so that any vm call we have to 1841 // make from here on out (sync slow path, jvmti, etc.) we will have 1842 // captured the oops from our caller and have a valid oopMap for 1843 // them. 1844 1845 // ----------------- 1846 // The Grand Shuffle 1847 1848 // The Java calling convention is either equal (linux) or denser (win64) than the 1849 // c calling convention. However the because of the jni_env argument the c calling 1850 // convention always has at least one more (and two for static) arguments than Java. 
1851 // Therefore if we move the args from java -> c backwards then we will never have 1852 // a register->register conflict and we don't have to build a dependency graph 1853 // and figure out how to break any cycles. 1854 // 1855 1856 // Record esp-based slot for receiver on stack for non-static methods 1857 int receiver_offset = -1; 1858 1859 // This is a trick. We double the stack slots so we can claim 1860 // the oops in the caller's frame. Since we are sure to have 1861 // more args than the caller doubling is enough to make 1862 // sure we can capture all the incoming oop args from the 1863 // caller. 1864 // 1865 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1866 1867 // Mark location of rbp (someday) 1868 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1869 1870 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1871 // All inbound args are referenced based on rbp and all outbound args via rsp. 1872 1873 1874 #ifdef ASSERT 1875 bool reg_destroyed[RegisterImpl::number_of_registers]; 1876 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1877 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1878 reg_destroyed[r] = false; 1879 } 1880 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1881 freg_destroyed[f] = false; 1882 } 1883 1884 #endif /* ASSERT */ 1885 1886 // This may iterate in two different directions depending on the 1887 // kind of native it is. The reason is that for regular JNI natives 1888 // the incoming and outgoing registers are offset upwards and for 1889 // critical natives they are offset down. 1890 GrowableArray<int> arg_order(2 * total_in_args); 1891 1892 VMRegPair tmp_vmreg; 1893 tmp_vmreg.set2(rbx->as_VMReg()); 1894 1895 if (!is_critical_native) { 1896 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1897 arg_order.push(i); 1898 arg_order.push(c_arg); 1899 } 1900 } else { 1901 // Compute a valid move order, using tmp_vmreg to break any cycles 1902 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); 1903 } 1904 1905 int temploc = -1; 1906 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1907 int i = arg_order.at(ai); 1908 int c_arg = arg_order.at(ai + 1); 1909 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1910 if (c_arg == -1) { 1911 assert(is_critical_native, "should only be required for critical natives"); 1912 // This arg needs to be moved to a temporary 1913 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); 1914 in_regs[i] = tmp_vmreg; 1915 temploc = i; 1916 continue; 1917 } else if (i == -1) { 1918 assert(is_critical_native, "should only be required for critical natives"); 1919 // Read from the temporary location 1920 assert(temploc != -1, "must be valid"); 1921 i = temploc; 1922 temploc = -1; 1923 } 1924 #ifdef ASSERT 1925 if (in_regs[i].first()->is_Register()) { 1926 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1927 } else if (in_regs[i].first()->is_XMMRegister()) { 1928 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1929 } 1930 if (out_regs[c_arg].first()->is_Register()) { 1931 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1932 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1933 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1934 } 1935 #endif /* 
ASSERT */ 1936 switch (in_sig_bt[i]) { 1937 case T_ARRAY: 1938 if (is_critical_native) { 1939 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]); 1940 c_arg++; 1941 #ifdef ASSERT 1942 if (out_regs[c_arg].first()->is_Register()) { 1943 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1944 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1945 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1946 } 1947 #endif 1948 break; 1949 } 1950 case T_OBJECT: 1951 assert(!is_critical_native, "no oop arguments"); 1952 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 1953 ((i == 0) && (!is_static)), 1954 &receiver_offset); 1955 break; 1956 case T_VOID: 1957 break; 1958 1959 case T_FLOAT: 1960 __ float_move(in_regs[i], out_regs[c_arg]); 1961 break; 1962 1963 case T_DOUBLE: 1964 assert( i + 1 < total_in_args && 1965 in_sig_bt[i + 1] == T_VOID && 1966 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 1967 __ double_move(in_regs[i], out_regs[c_arg]); 1968 break; 1969 1970 case T_LONG : 1971 __ long_move(in_regs[i], out_regs[c_arg]); 1972 break; 1973 1974 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 1975 1976 default: 1977 __ move32_64(in_regs[i], out_regs[c_arg]); 1978 } 1979 } 1980 1981 int c_arg; 1982 1983 // Pre-load a static method's oop into r14. Used both by locking code and 1984 // the normal JNI call code. 1985 if (!is_critical_native) { 1986 // point c_arg at the first arg that is already loaded in case we 1987 // need to spill before we call out 1988 c_arg = total_c_args - total_in_args; 1989 1990 if (method->is_static()) { 1991 1992 // load oop into a register 1993 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 1994 1995 // Now handlize the static class mirror it's known not-null. 1996 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 1997 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 1998 1999 // Now get the handle 2000 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2001 // store the klass handle as second argument 2002 __ movptr(c_rarg1, oop_handle_reg); 2003 // and protect the arg if we must spill 2004 c_arg--; 2005 } 2006 } else { 2007 // For JNI critical methods we need to save all registers in save_args. 2008 c_arg = 0; 2009 } 2010 2011 // Change state to native (we save the return address in the thread, since it might not 2012 // be pushed on the stack when we do a a stack traversal). It is enough that the pc() 2013 // points into the right code segment. It does not have to be the correct return pc. 2014 // We use the same pc/oopMap repeatedly when we call out 2015 2016 intptr_t the_pc = (intptr_t) __ pc(); 2017 oop_maps->add_gc_map(the_pc - start, map); 2018 2019 __ set_last_Java_frame(rsp, noreg, (address)the_pc); 2020 2021 2022 // We have all of the arguments setup at this point. We must not touch any register 2023 // argument registers at this point (what if we save/restore them there are no oop? 
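  // Because the argument registers now hold the outgoing C arguments, each VM
  // call below that could clobber them is bracketed, roughly, as
  //
  //   save_args(masm, total_c_args, c_arg, out_regs);     // spill arg registers
  //   __ call_VM_leaf(...);                                // safe to clobber
  //   restore_args(masm, total_c_args, c_arg, out_regs);   // reload them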
2024 2025 { 2026 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2027 // protect the args we've loaded 2028 save_args(masm, total_c_args, c_arg, out_regs); 2029 __ mov_metadata(c_rarg1, method()); 2030 __ call_VM_leaf( 2031 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2032 r15_thread, c_rarg1); 2033 restore_args(masm, total_c_args, c_arg, out_regs); 2034 } 2035 2036 // RedefineClasses() tracing support for obsolete method entry 2037 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2038 // protect the args we've loaded 2039 save_args(masm, total_c_args, c_arg, out_regs); 2040 __ mov_metadata(c_rarg1, method()); 2041 __ call_VM_leaf( 2042 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2043 r15_thread, c_rarg1); 2044 restore_args(masm, total_c_args, c_arg, out_regs); 2045 } 2046 2047 // Lock a synchronized method 2048 2049 // Register definitions used by locking and unlocking 2050 2051 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2052 const Register obj_reg = rbx; // Will contain the oop 2053 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2054 const Register old_hdr = r13; // value of old header at unlock time 2055 2056 Label slow_path_lock; 2057 Label lock_done; 2058 2059 if (method->is_synchronized()) { 2060 assert(!is_critical_native, "unhandled"); 2061 2062 2063 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2064 2065 // Get the handle (the 2nd argument) 2066 __ mov(oop_handle_reg, c_rarg1); 2067 2068 // Get address of the box 2069 2070 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2071 2072 // Load the oop from the handle 2073 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2074 2075 if (UseFastLocking) { 2076 // Load object header 2077 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2078 __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2079 } else { 2080 if (UseBiasedLocking) { 2081 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock); 2082 } 2083 2084 // Load immediate 1 into swap_reg %rax 2085 __ movl(swap_reg, 1); 2086 2087 // Load (object->mark() | 1) into swap_reg %rax 2088 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2089 2090 // Save (object->mark() | 1) into BasicLock's displaced header 2091 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2092 2093 // src -> dest iff dest == rax else rax <- dest 2094 __ lock(); 2095 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2096 __ jcc(Assembler::equal, lock_done); 2097 2098 // Hmm should this move to the slow path code area??? 2099 2100 // Test if the oopMark is an obvious stack pointer, i.e., 2101 // 1) (mark & 3) == 0, and 2102 // 2) rsp <= mark < mark + os::pagesize() 2103 // These 3 tests can be done by evaluating the following 2104 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2105 // assuming both stack pointer and pagesize have their 2106 // least significant 2 bits clear. 
2107 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2108 2109 __ subptr(swap_reg, rsp); 2110 __ andptr(swap_reg, 3 - os::vm_page_size()); 2111 2112 // Save the test result, for recursive case, the result is zero 2113 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2114 __ jcc(Assembler::notEqual, slow_path_lock); 2115 } 2116 2117 // Slow path will re-enter here 2118 2119 __ bind(lock_done); 2120 } 2121 2122 // Finally just about ready to make the JNI call 2123 2124 // get JNIEnv* which is first argument to native 2125 if (!is_critical_native) { 2126 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2127 2128 // Now set thread in native 2129 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2130 } 2131 2132 __ call(RuntimeAddress(native_func)); 2133 2134 // Verify or restore cpu control state after JNI call 2135 __ restore_cpu_control_state_after_jni(); 2136 2137 // Unpack native results. 2138 switch (ret_type) { 2139 case T_BOOLEAN: __ c2bool(rax); break; 2140 case T_CHAR : __ movzwl(rax, rax); break; 2141 case T_BYTE : __ sign_extend_byte (rax); break; 2142 case T_SHORT : __ sign_extend_short(rax); break; 2143 case T_INT : /* nothing to do */ break; 2144 case T_DOUBLE : 2145 case T_FLOAT : 2146 // Result is in xmm0 we'll save as needed 2147 break; 2148 case T_ARRAY: // Really a handle 2149 case T_OBJECT: // Really a handle 2150 break; // can't de-handlize until after safepoint check 2151 case T_VOID: break; 2152 case T_LONG: break; 2153 default : ShouldNotReachHere(); 2154 } 2155 2156 Label after_transition; 2157 2158 // If this is a critical native, check for a safepoint or suspend request after the call. 2159 // If a safepoint is needed, transition to native, then to native_trans to handle 2160 // safepoints like the native methods that are not critical natives. 2161 if (is_critical_native) { 2162 Label needs_safepoint; 2163 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */); 2164 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2165 __ jcc(Assembler::equal, after_transition); 2166 __ bind(needs_safepoint); 2167 } 2168 2169 // Switch thread to "native transition" state before reading the synchronization state. 2170 // This additional state is necessary because reading and testing the synchronization 2171 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2172 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2173 // VM thread changes sync state to synchronizing and suspends threads for GC. 2174 // Thread A is resumed to finish this native method, but doesn't block here since it 2175 // didn't see any synchronization is progress, and escapes. 
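  // In outline (sketch only), the transition emitted below is
  //
  //   thread->set_thread_state(_thread_in_native_trans);
  //   full memory fence                       // make the state change visible
  //   if (safepoint poll fires || suspend_flags != 0)
  //     JavaThread::check_special_condition_for_native_trans(thread);  // may block
  //   thread->set_thread_state(_thread_in_Java);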
2176 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2177 2178 // Force this write out before the read below 2179 __ membar(Assembler::Membar_mask_bits( 2180 Assembler::LoadLoad | Assembler::LoadStore | 2181 Assembler::StoreLoad | Assembler::StoreStore)); 2182 2183 // check for safepoint operation in progress and/or pending suspend requests 2184 { 2185 Label Continue; 2186 Label slow_path; 2187 2188 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2189 2190 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2191 __ jcc(Assembler::equal, Continue); 2192 __ bind(slow_path); 2193 2194 // Don't use call_VM as it will see a possible pending exception and forward it 2195 // and never return here preventing us from clearing _last_native_pc down below. 2196 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2197 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2198 // by hand. 2199 // 2200 __ vzeroupper(); 2201 save_native_result(masm, ret_type, stack_slots); 2202 __ mov(c_rarg0, r15_thread); 2203 __ mov(r12, rsp); // remember sp 2204 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2205 __ andptr(rsp, -16); // align stack as required by ABI 2206 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2207 __ mov(rsp, r12); // restore sp 2208 __ reinit_heapbase(); 2209 // Restore any method result value 2210 restore_native_result(masm, ret_type, stack_slots); 2211 __ bind(Continue); 2212 } 2213 2214 // change thread state 2215 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2216 __ bind(after_transition); 2217 2218 Label reguard; 2219 Label reguard_done; 2220 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2221 __ jcc(Assembler::equal, reguard); 2222 __ bind(reguard_done); 2223 2224 // native result if any is live 2225 2226 // Unlock 2227 Label unlock_done; 2228 Label slow_path_unlock; 2229 if (method->is_synchronized()) { 2230 2231 // Get locked oop from the handle we passed to jni 2232 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2233 2234 Label done; 2235 2236 if (UseBiasedLocking) { 2237 __ biased_locking_exit(obj_reg, old_hdr, done); 2238 } 2239 2240 if (!UseFastLocking) { 2241 // Simple recursive lock? 
2242 2243 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2244 __ jcc(Assembler::equal, done); 2245 } 2246 2247 // Must save rax if if it is live now because cmpxchg must use it 2248 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2249 save_native_result(masm, ret_type, stack_slots); 2250 } 2251 2252 if (UseFastLocking) { 2253 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2254 __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place); 2255 __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2256 } else { 2257 // get address of the stack lock 2258 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2259 // get old displaced header 2260 __ movptr(old_hdr, Address(rax, 0)); 2261 2262 // Atomic swap old header if oop still contains the stack lock 2263 __ lock(); 2264 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2265 __ jcc(Assembler::notEqual, slow_path_unlock); 2266 } 2267 2268 // slow path re-enters here 2269 __ bind(unlock_done); 2270 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2271 restore_native_result(masm, ret_type, stack_slots); 2272 } 2273 2274 __ bind(done); 2275 2276 } 2277 { 2278 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2279 save_native_result(masm, ret_type, stack_slots); 2280 __ mov_metadata(c_rarg1, method()); 2281 __ call_VM_leaf( 2282 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2283 r15_thread, c_rarg1); 2284 restore_native_result(masm, ret_type, stack_slots); 2285 } 2286 2287 __ reset_last_Java_frame(false); 2288 2289 // Unbox oop result, e.g. JNIHandles::resolve value. 2290 if (is_reference_type(ret_type)) { 2291 __ resolve_jobject(rax /* value */, 2292 r15_thread /* thread */, 2293 rcx /* tmp */); 2294 } 2295 2296 if (CheckJNICalls) { 2297 // clear_pending_jni_exception_check 2298 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2299 } 2300 2301 if (!is_critical_native) { 2302 // reset handle block 2303 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2304 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2305 } 2306 2307 // pop our frame 2308 2309 __ leave(); 2310 2311 if (!is_critical_native) { 2312 // Any exception pending? 2313 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2314 __ jcc(Assembler::notEqual, exception_pending); 2315 } 2316 2317 // Return 2318 2319 __ ret(0); 2320 2321 // Unexpected paths are out of line and go here 2322 2323 if (!is_critical_native) { 2324 // forward the exception 2325 __ bind(exception_pending); 2326 2327 // and forward the exception 2328 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2329 } 2330 2331 // Slow path locking & unlocking 2332 if (method->is_synchronized()) { 2333 2334 // BEGIN Slow path lock 2335 __ bind(slow_path_lock); 2336 2337 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2338 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2339 2340 // protect the args we've loaded 2341 save_args(masm, total_c_args, c_arg, out_regs); 2342 2343 __ mov(c_rarg0, obj_reg); 2344 __ mov(c_rarg1, lock_reg); 2345 __ mov(c_rarg2, r15_thread); 2346 2347 // Not a leaf but we have last_Java_frame setup as we want 2348 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2349 restore_args(masm, total_c_args, c_arg, out_regs); 2350 2351 #ifdef ASSERT 2352 { Label L; 2353 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2354 __ jcc(Assembler::equal, L); 2355 __ stop("no pending exception allowed on exit from monitorenter"); 2356 __ bind(L); 2357 } 2358 #endif 2359 __ jmp(lock_done); 2360 2361 // END Slow path lock 2362 2363 // BEGIN Slow path unlock 2364 __ bind(slow_path_unlock); 2365 2366 // If we haven't already saved the native result we must save it now as xmm registers 2367 // are still exposed. 2368 __ vzeroupper(); 2369 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2370 save_native_result(masm, ret_type, stack_slots); 2371 } 2372 2373 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2374 2375 __ mov(c_rarg0, obj_reg); 2376 __ mov(c_rarg2, r15_thread); 2377 __ mov(r12, rsp); // remember sp 2378 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2379 __ andptr(rsp, -16); // align stack as required by ABI 2380 2381 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2382 // NOTE that obj_reg == rbx currently 2383 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2384 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2385 2386 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2387 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2388 __ mov(rsp, r12); // restore sp 2389 __ reinit_heapbase(); 2390 #ifdef ASSERT 2391 { 2392 Label L; 2393 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2394 __ jcc(Assembler::equal, L); 2395 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2396 __ bind(L); 2397 } 2398 #endif /* ASSERT */ 2399 2400 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2401 2402 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2403 restore_native_result(masm, ret_type, stack_slots); 2404 } 2405 __ jmp(unlock_done); 2406 2407 // END Slow path unlock 2408 2409 } // synchronized 2410 2411 // SLOW PATH Reguard the stack if needed 2412 2413 __ bind(reguard); 2414 __ vzeroupper(); 2415 save_native_result(masm, ret_type, stack_slots); 2416 __ mov(r12, rsp); // remember sp 2417 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2418 __ andptr(rsp, -16); // align stack as required by ABI 2419 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2420 __ mov(rsp, r12); // restore sp 2421 __ reinit_heapbase(); 2422 restore_native_result(masm, ret_type, stack_slots); 2423 // and continue 2424 __ jmp(reguard_done); 2425 2426 2427 2428 __ flush(); 2429 2430 nmethod *nm = nmethod::new_native_nmethod(method, 2431 compile_id, 2432 masm->code(), 2433 vep_offset, 2434 frame_complete, 2435 stack_slots / VMRegImpl::slots_per_word, 2436 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2437 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2438 oop_maps); 2439 2440 return nm; 2441 } 2442 2443 // this function returns the adjust size (in number of words) to a c2i adapter 2444 // activation for use during deoptimization 2445 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2446 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2447 } 2448 2449 2450 uint SharedRuntime::out_preserve_stack_slots() { 2451 return 0; 2452 } 2453 2454 2455 // Number of stack slots between incoming argument block and the start of 2456 // a new frame. The PROLOG must add this many slots to the stack. The 2457 // EPILOG must remove this many slots. amd64 needs two slots for 2458 // return address. 2459 uint SharedRuntime::in_preserve_stack_slots() { 2460 return 4 + 2 * VerifyStackAtCalls; 2461 } 2462 2463 //------------------------------generate_deopt_blob---------------------------- 2464 void SharedRuntime::generate_deopt_blob() { 2465 // Allocate space for the code 2466 ResourceMark rm; 2467 // Setup code generation tools 2468 int pad = 0; 2469 if (UseAVX > 2) { 2470 pad += 1024; 2471 } 2472 #if INCLUDE_JVMCI 2473 if (EnableJVMCI) { 2474 pad += 512; // Increase the buffer size when compiling for JVMCI 2475 } 2476 #endif 2477 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2478 MacroAssembler* masm = new MacroAssembler(&buffer); 2479 int frame_size_in_words; 2480 OopMap* map = NULL; 2481 OopMapSet *oop_maps = new OopMapSet(); 2482 2483 // ------------- 2484 // This code enters when returning to a de-optimized nmethod. A return 2485 // address has been pushed on the the stack, and return values are in 2486 // registers. 2487 // If we are doing a normal deopt then we were called from the patched 2488 // nmethod from the point we returned to the nmethod. So the return 2489 // address on the stack is wrong by NativeCall::instruction_size 2490 // We will adjust the value so it looks like we have the original return 2491 // address on the stack (like when we eagerly deoptimized). 2492 // In the case of an exception pending when deoptimizing, we enter 2493 // with a return address on the stack that points after the call we patched 2494 // into the exception handler. We have the following register state from, 2495 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2496 // rax: exception oop 2497 // rbx: exception handler 2498 // rdx: throwing pc 2499 // So in this case we simply jam rdx into the useless return address and 2500 // the stack looks just like we want. 2501 // 2502 // At this point we need to de-opt. We save the argument return 2503 // registers. We call the first C routine, fetch_unroll_info(). This 2504 // routine captures the return values and returns a structure which 2505 // describes the current frame size and the sizes of all replacement frames. 2506 // The current frame is compiled code and may contain many inlined 2507 // functions, each with their own JVM state. We pop the current frame, then 2508 // push all the new frames. Then we call the C routine unpack_frames() to 2509 // populate these frames. Finally unpack_frames() returns us the new target 2510 // address. Notice that callee-save registers are BLOWN here; they have 2511 // already been captured in the vframeArray at the time the return PC was 2512 // patched. 2513 address start = __ pc(); 2514 Label cont; 2515 2516 // Prolog for non exception case! 
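  // Roadmap (descriptive only): the code below generates several entry points
  // whose offsets are recorded when the blob is created at the end of this
  // function: the normal path (r14 = Unpack_deopt), reexecute_offset
  // (r14 = Unpack_reexecute), exception_offset / exception_in_tls_offset
  // (r14 = Unpack_exception) and, when JVMCI is enabled, uncommon_trap_offset
  // and implicit_exception_uncommon_trap_offset.  Each path saves the full
  // register state before reaching the fetch_unroll_info()/uncommon_trap()
  // call.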
2517 2518 // Save everything in sight. 2519 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2520 2521 // Normal deoptimization. Save exec mode for unpack_frames. 2522 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2523 __ jmp(cont); 2524 2525 int reexecute_offset = __ pc() - start; 2526 #if INCLUDE_JVMCI && !defined(COMPILER1) 2527 if (EnableJVMCI && UseJVMCICompiler) { 2528 // JVMCI does not use this kind of deoptimization 2529 __ should_not_reach_here(); 2530 } 2531 #endif 2532 2533 // Reexecute case 2534 // return address is the pc describes what bci to do re-execute at 2535 2536 // No need to update map as each call to save_live_registers will produce identical oopmap 2537 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2538 2539 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2540 __ jmp(cont); 2541 2542 #if INCLUDE_JVMCI 2543 Label after_fetch_unroll_info_call; 2544 int implicit_exception_uncommon_trap_offset = 0; 2545 int uncommon_trap_offset = 0; 2546 2547 if (EnableJVMCI) { 2548 implicit_exception_uncommon_trap_offset = __ pc() - start; 2549 2550 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2551 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD); 2552 2553 uncommon_trap_offset = __ pc() - start; 2554 2555 // Save everything in sight. 2556 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2557 // fetch_unroll_info needs to call last_java_frame() 2558 __ set_last_Java_frame(noreg, noreg, NULL); 2559 2560 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2561 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2562 2563 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute); 2564 __ mov(c_rarg0, r15_thread); 2565 __ movl(c_rarg2, r14); // exec mode 2566 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2567 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2568 2569 __ reset_last_Java_frame(false); 2570 2571 __ jmp(after_fetch_unroll_info_call); 2572 } // EnableJVMCI 2573 #endif // INCLUDE_JVMCI 2574 2575 int exception_offset = __ pc() - start; 2576 2577 // Prolog for exception case 2578 2579 // all registers are dead at this entry point, except for rax, and 2580 // rdx which contain the exception oop and exception pc 2581 // respectively. Set them in TLS and fall thru to the 2582 // unpack_with_exception_in_tls entry point. 2583 2584 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2585 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2586 2587 int exception_in_tls_offset = __ pc() - start; 2588 2589 // new implementation because exception oop is now passed in JavaThread 2590 2591 // Prolog for exception case 2592 // All registers must be preserved because they might be used by LinearScan 2593 // Exceptiop oop and throwing PC are passed in JavaThread 2594 // tos: stack at point of call to method that threw the exception (i.e. only 2595 // args are on the stack, no return address) 2596 2597 // make room on stack for the return address 2598 // It will be patched later with the throwing pc. The correct value is not 2599 // available now because loading it from memory would destroy registers. 
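  // Concretely: the push(0) below reserves the return-address slot; once
  // save_live_registers() has built the frame, that slot is addressable as
  // Address(rbp, wordSize), and a few instructions further down the throwing
  // pc is loaded from JavaThread::exception_pc_offset() and stored into it.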
2600 __ push(0); 2601 2602 // Save everything in sight. 2603 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2604 2605 // Now it is safe to overwrite any register 2606 2607 // Deopt during an exception. Save exec mode for unpack_frames. 2608 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2609 2610 // load throwing pc from JavaThread and patch it as the return address 2611 // of the current frame. Then clear the field in JavaThread 2612 2613 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2614 __ movptr(Address(rbp, wordSize), rdx); 2615 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2616 2617 #ifdef ASSERT 2618 // verify that there is really an exception oop in JavaThread 2619 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2620 __ verify_oop(rax); 2621 2622 // verify that there is no pending exception 2623 Label no_pending_exception; 2624 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2625 __ testptr(rax, rax); 2626 __ jcc(Assembler::zero, no_pending_exception); 2627 __ stop("must not have pending exception here"); 2628 __ bind(no_pending_exception); 2629 #endif 2630 2631 __ bind(cont); 2632 2633 // Call C code. Need thread and this frame, but NOT official VM entry 2634 // crud. We cannot block on this call, no GC can happen. 2635 // 2636 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2637 2638 // fetch_unroll_info needs to call last_java_frame(). 2639 2640 __ set_last_Java_frame(noreg, noreg, NULL); 2641 #ifdef ASSERT 2642 { Label L; 2643 __ cmpptr(Address(r15_thread, 2644 JavaThread::last_Java_fp_offset()), 2645 (int32_t)0); 2646 __ jcc(Assembler::equal, L); 2647 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2648 __ bind(L); 2649 } 2650 #endif // ASSERT 2651 __ mov(c_rarg0, r15_thread); 2652 __ movl(c_rarg1, r14); // exec_mode 2653 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2654 2655 // Need to have an oopmap that tells fetch_unroll_info where to 2656 // find any register it might need. 2657 oop_maps->add_gc_map(__ pc() - start, map); 2658 2659 __ reset_last_Java_frame(false); 2660 2661 #if INCLUDE_JVMCI 2662 if (EnableJVMCI) { 2663 __ bind(after_fetch_unroll_info_call); 2664 } 2665 #endif 2666 2667 // Load UnrollBlock* into rdi 2668 __ mov(rdi, rax); 2669 2670 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2671 Label noException; 2672 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2673 __ jcc(Assembler::notEqual, noException); 2674 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2675 // QQQ this is useless it was NULL above 2676 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2677 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2678 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2679 2680 __ verify_oop(rax); 2681 2682 // Overwrite the result registers with the exception results. 2683 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2684 // I think this is useless 2685 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2686 2687 __ bind(noException); 2688 2689 // Only register save data is on the stack. 2690 // Now restore the result registers. 
Everything else is either dead 2691 // or captured in the vframeArray. 2692 RegisterSaver::restore_result_registers(masm); 2693 2694 // All of the register save area has been popped of the stack. Only the 2695 // return address remains. 2696 2697 // Pop all the frames we must move/replace. 2698 // 2699 // Frame picture (youngest to oldest) 2700 // 1: self-frame (no frame link) 2701 // 2: deopting frame (no frame link) 2702 // 3: caller of deopting frame (could be compiled/interpreted). 2703 // 2704 // Note: by leaving the return address of self-frame on the stack 2705 // and using the size of frame 2 to adjust the stack 2706 // when we are done the return to frame 3 will still be on the stack. 2707 2708 // Pop deoptimized frame 2709 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes())); 2710 __ addptr(rsp, rcx); 2711 2712 // rsp should be pointing at the return address to the caller (3) 2713 2714 // Pick up the initial fp we should save 2715 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2716 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2717 2718 #ifdef ASSERT 2719 // Compilers generate code that bang the stack by as much as the 2720 // interpreter would need. So this stack banging should never 2721 // trigger a fault. Verify that it does not on non product builds. 2722 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2723 __ bang_stack_size(rbx, rcx); 2724 #endif 2725 2726 // Load address of array of frame pcs into rcx 2727 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2728 2729 // Trash the old pc 2730 __ addptr(rsp, wordSize); 2731 2732 // Load address of array of frame sizes into rsi 2733 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes())); 2734 2735 // Load counter into rdx 2736 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); 2737 2738 // Now adjust the caller's stack to make up for the extra locals 2739 // but record the original sp so that we can save it in the skeletal interpreter 2740 // frame and the stack walking of interpreter_sender will get the unextended sp 2741 // value and not the "real" sp value. 
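  // Illustrative sketch (not generated code): the loop below is, in effect,
  //
  //   for (int k = 0; k < number_of_frames; k++) {
  //     size = frame_sizes[k] - 2 * wordSize;  // pc and rbp are pushed by hand
  //     push(frame_pcs[k]);                    // return address for this frame
  //     enter();                               // save old rbp, set new rbp
  //     rsp -= size;                           // rest of the skeletal frame
  //     ... clear interpreter_frame_last_sp, record sender_sp ...
  //   }
  //   push(frame_pcs[number_of_frames]);       // final return address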
2742 2743 const Register sender_sp = r8; 2744 2745 __ mov(sender_sp, rsp); 2746 __ movl(rbx, Address(rdi, 2747 Deoptimization::UnrollBlock:: 2748 caller_adjustment_offset_in_bytes())); 2749 __ subptr(rsp, rbx); 2750 2751 // Push interpreter frames in a loop 2752 Label loop; 2753 __ bind(loop); 2754 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2755 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2756 __ pushptr(Address(rcx, 0)); // Save return address 2757 __ enter(); // Save old & set new ebp 2758 __ subptr(rsp, rbx); // Prolog 2759 // This value is corrected by layout_activation_impl 2760 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2761 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2762 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2763 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2764 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2765 __ decrementl(rdx); // Decrement counter 2766 __ jcc(Assembler::notZero, loop); 2767 __ pushptr(Address(rcx, 0)); // Save final return address 2768 2769 // Re-push self-frame 2770 __ enter(); // Save old & set new ebp 2771 2772 // Allocate a full sized register save area. 2773 // Return address and rbp are in place, so we allocate two less words. 2774 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2775 2776 // Restore frame locals after moving the frame 2777 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2778 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2779 2780 // Call C code. Need thread but NOT official VM entry 2781 // crud. We cannot block on this call, no GC can happen. Call should 2782 // restore return values to their stack-slots with the new SP. 2783 // 2784 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2785 2786 // Use rbp because the frames look interpreted now 2787 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2788 // Don't need the precise return PC here, just precise enough to point into this code blob. 2789 address the_pc = __ pc(); 2790 __ set_last_Java_frame(noreg, rbp, the_pc); 2791 2792 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2793 __ mov(c_rarg0, r15_thread); 2794 __ movl(c_rarg1, r14); // second arg: exec_mode 2795 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2796 // Revert SP alignment after call since we're going to do some SP relative addressing below 2797 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2798 2799 // Set an oopmap for the call site 2800 // Use the same PC we used for the last java frame 2801 oop_maps->add_gc_map(the_pc - start, 2802 new OopMap( frame_size_in_words, 0 )); 2803 2804 // Clear fp AND pc 2805 __ reset_last_Java_frame(true); 2806 2807 // Collect return values 2808 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2809 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2810 // I think this is useless (throwing pc?) 2811 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2812 2813 // Pop self-frame. 
2814 __ leave(); // Epilog 2815 2816 // Jump to interpreter 2817 __ ret(0); 2818 2819 // Make sure all code is generated 2820 masm->flush(); 2821 2822 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2823 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2824 #if INCLUDE_JVMCI 2825 if (EnableJVMCI) { 2826 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2827 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2828 } 2829 #endif 2830 } 2831 2832 #ifdef COMPILER2 2833 //------------------------------generate_uncommon_trap_blob-------------------- 2834 void SharedRuntime::generate_uncommon_trap_blob() { 2835 // Allocate space for the code 2836 ResourceMark rm; 2837 // Setup code generation tools 2838 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2839 MacroAssembler* masm = new MacroAssembler(&buffer); 2840 2841 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2842 2843 address start = __ pc(); 2844 2845 if (UseRTMLocking) { 2846 // Abort RTM transaction before possible nmethod deoptimization. 2847 __ xabort(0); 2848 } 2849 2850 // Push self-frame. We get here with a return address on the 2851 // stack, so rsp is 8-byte aligned until we allocate our frame. 2852 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2853 2854 // No callee saved registers. rbp is assumed implicitly saved 2855 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2856 2857 // compiler left unloaded_class_index in j_rarg0 move to where the 2858 // runtime expects it. 2859 __ movl(c_rarg1, j_rarg0); 2860 2861 __ set_last_Java_frame(noreg, noreg, NULL); 2862 2863 // Call C code. Need thread but NOT official VM entry 2864 // crud. We cannot block on this call, no GC can happen. Call should 2865 // capture callee-saved registers as well as return values. 2866 // Thread is in rdi already. 2867 // 2868 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2869 2870 __ mov(c_rarg0, r15_thread); 2871 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2872 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2873 2874 // Set an oopmap for the call site 2875 OopMapSet* oop_maps = new OopMapSet(); 2876 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2877 2878 // location of rbp is known implicitly by the frame sender code 2879 2880 oop_maps->add_gc_map(__ pc() - start, map); 2881 2882 __ reset_last_Java_frame(false); 2883 2884 // Load UnrollBlock* into rdi 2885 __ mov(rdi, rax); 2886 2887 #ifdef ASSERT 2888 { Label L; 2889 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2890 (int32_t)Deoptimization::Unpack_uncommon_trap); 2891 __ jcc(Assembler::equal, L); 2892 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2893 __ bind(L); 2894 } 2895 #endif 2896 2897 // Pop all the frames we must move/replace. 2898 // 2899 // Frame picture (youngest to oldest) 2900 // 1: self-frame (no frame link) 2901 // 2: deopting frame (no frame link) 2902 // 3: caller of deopting frame (could be compiled/interpreted). 2903 2904 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2905 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
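  // From here on the blob mirrors the frame-replacement sequence in
  // generate_deopt_blob() above: pop the deoptimized frame, push one skeletal
  // interpreter frame per entry described by the UnrollBlock, then call
  // Deoptimization::unpack_frames() with Unpack_uncommon_trap as the exec
  // mode.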
2906 2907 // Pop deoptimized frame (int) 2908 __ movl(rcx, Address(rdi, 2909 Deoptimization::UnrollBlock:: 2910 size_of_deoptimized_frame_offset_in_bytes())); 2911 __ addptr(rsp, rcx); 2912 2913 // rsp should be pointing at the return address to the caller (3) 2914 2915 // Pick up the initial fp we should save 2916 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2917 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2918 2919 #ifdef ASSERT 2920 // Compilers generate code that bang the stack by as much as the 2921 // interpreter would need. So this stack banging should never 2922 // trigger a fault. Verify that it does not on non product builds. 2923 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2924 __ bang_stack_size(rbx, rcx); 2925 #endif 2926 2927 // Load address of array of frame pcs into rcx (address*) 2928 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2929 2930 // Trash the return pc 2931 __ addptr(rsp, wordSize); 2932 2933 // Load address of array of frame sizes into rsi (intptr_t*) 2934 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2935 2936 // Counter 2937 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2938 2939 // Now adjust the caller's stack to make up for the extra locals but 2940 // record the original sp so that we can save it in the skeletal 2941 // interpreter frame and the stack walking of interpreter_sender 2942 // will get the unextended sp value and not the "real" sp value. 2943 2944 const Register sender_sp = r8; 2945 2946 __ mov(sender_sp, rsp); 2947 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2948 __ subptr(rsp, rbx); 2949 2950 // Push interpreter frames in a loop 2951 Label loop; 2952 __ bind(loop); 2953 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2954 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2955 __ pushptr(Address(rcx, 0)); // Save return address 2956 __ enter(); // Save old & set new rbp 2957 __ subptr(rsp, rbx); // Prolog 2958 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2959 sender_sp); // Make it walkable 2960 // This value is corrected by layout_activation_impl 2961 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2962 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2963 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2964 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2965 __ decrementl(rdx); // Decrement counter 2966 __ jcc(Assembler::notZero, loop); 2967 __ pushptr(Address(rcx, 0)); // Save final return address 2968 2969 // Re-push self-frame 2970 __ enter(); // Save old & set new rbp 2971 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2972 // Prolog 2973 2974 // Use rbp because the frames look interpreted now 2975 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2976 // Don't need the precise return PC here, just precise enough to point into this code blob. 2977 address the_pc = __ pc(); 2978 __ set_last_Java_frame(noreg, rbp, the_pc); 2979 2980 // Call C code. Need thread but NOT official VM entry 2981 // crud. We cannot block on this call, no GC can happen. 
Call should 2982 // restore return values to their stack-slots with the new SP. 2983 // Thread is in rdi already. 2984 // 2985 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 2986 2987 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 2988 __ mov(c_rarg0, r15_thread); 2989 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 2990 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2991 2992 // Set an oopmap for the call site 2993 // Use the same PC we used for the last java frame 2994 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 2995 2996 // Clear fp AND pc 2997 __ reset_last_Java_frame(true); 2998 2999 // Pop self-frame. 3000 __ leave(); // Epilog 3001 3002 // Jump to interpreter 3003 __ ret(0); 3004 3005 // Make sure all code is generated 3006 masm->flush(); 3007 3008 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 3009 SimpleRuntimeFrame::framesize >> 1); 3010 } 3011 #endif // COMPILER2 3012 3013 //------------------------------generate_handler_blob------ 3014 // 3015 // Generate a special Compile2Runtime blob that saves all registers, 3016 // and setup oopmap. 3017 // 3018 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3019 assert(StubRoutines::forward_exception_entry() != NULL, 3020 "must be generated before"); 3021 3022 ResourceMark rm; 3023 OopMapSet *oop_maps = new OopMapSet(); 3024 OopMap* map; 3025 3026 // Allocate space for the code. Setup code generation tools. 3027 CodeBuffer buffer("handler_blob", 2048, 1024); 3028 MacroAssembler* masm = new MacroAssembler(&buffer); 3029 3030 address start = __ pc(); 3031 address call_pc = NULL; 3032 int frame_size_in_words; 3033 bool cause_return = (poll_type == POLL_AT_RETURN); 3034 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3035 3036 if (UseRTMLocking) { 3037 // Abort RTM transaction before calling runtime 3038 // because critical section will be large and will be 3039 // aborted anyway. Also nmethod could be deoptimized. 3040 __ xabort(0); 3041 } 3042 3043 // Make room for return address (or push it again) 3044 if (!cause_return) { 3045 __ push(rbx); 3046 } 3047 3048 // Save registers, fpu state, and flags 3049 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3050 3051 // The following is basically a call_VM. However, we need the precise 3052 // address of the call in order to generate an oopmap. Hence, we do all the 3053 // work outselves. 3054 3055 __ set_last_Java_frame(noreg, noreg, NULL); 3056 3057 // The return address must always be correct so that frame constructor never 3058 // sees an invalid pc. 3059 3060 if (!cause_return) { 3061 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3062 // Additionally, rbx is a callee saved register and we can look at it later to determine 3063 // if someone changed the return address for us! 3064 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3065 __ movptr(Address(rbp, wordSize), rbx); 3066 } 3067 3068 // Do the call 3069 __ mov(c_rarg0, r15_thread); 3070 __ call(RuntimeAddress(call_ptr)); 3071 3072 // Set an oopmap for the call site. This oopmap will map all 3073 // oop-registers and debug-info registers as callee-saved. This 3074 // will allow deoptimization at this safepoint to find all possible 3075 // debug-info recordings, as well as let GC find all oops. 
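  // Note that the offset passed to add_gc_map() below is __ pc() - start,
  // i.e. the offset of the instruction after the call just emitted (the
  // return address); a stack walk at the safepoint locates this map by that
  // return-pc offset within the blob.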
3076 3077 oop_maps->add_gc_map( __ pc() - start, map); 3078 3079 Label noException; 3080 3081 __ reset_last_Java_frame(false); 3082 3083 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 3084 __ jcc(Assembler::equal, noException); 3085 3086 // Exception pending 3087 3088 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3089 3090 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3091 3092 // No exception case 3093 __ bind(noException); 3094 3095 Label no_adjust; 3096 #ifdef ASSERT 3097 Label bail; 3098 #endif 3099 if (!cause_return) { 3100 Label no_prefix, not_special; 3101 3102 // If our stashed return pc was modified by the runtime we avoid touching it 3103 __ cmpptr(rbx, Address(rbp, wordSize)); 3104 __ jccb(Assembler::notEqual, no_adjust); 3105 3106 // Skip over the poll instruction. 3107 // See NativeInstruction::is_safepoint_poll() 3108 // Possible encodings: 3109 // 85 00 test %eax,(%rax) 3110 // 85 01 test %eax,(%rcx) 3111 // 85 02 test %eax,(%rdx) 3112 // 85 03 test %eax,(%rbx) 3113 // 85 06 test %eax,(%rsi) 3114 // 85 07 test %eax,(%rdi) 3115 // 3116 // 41 85 00 test %eax,(%r8) 3117 // 41 85 01 test %eax,(%r9) 3118 // 41 85 02 test %eax,(%r10) 3119 // 41 85 03 test %eax,(%r11) 3120 // 41 85 06 test %eax,(%r14) 3121 // 41 85 07 test %eax,(%r15) 3122 // 3123 // 85 04 24 test %eax,(%rsp) 3124 // 41 85 04 24 test %eax,(%r12) 3125 // 85 45 00 test %eax,0x0(%rbp) 3126 // 41 85 45 00 test %eax,0x0(%r13) 3127 3128 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3129 __ jcc(Assembler::notEqual, no_prefix); 3130 __ addptr(rbx, 1); 3131 __ bind(no_prefix); 3132 #ifdef ASSERT 3133 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3134 #endif 3135 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3136 // r12/rsp 0x04 3137 // r13/rbp 0x05 3138 __ movzbq(rcx, Address(rbx, 1)); 3139 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3140 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3141 __ cmpptr(rcx, 1); 3142 __ jcc(Assembler::above, not_special); 3143 __ addptr(rbx, 1); 3144 __ bind(not_special); 3145 #ifdef ASSERT 3146 // Verify the correct encoding of the poll we're about to skip. 3147 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3148 __ jcc(Assembler::notEqual, bail); 3149 // Mask out the modrm bits 3150 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3151 // rax encodes to 0, so if the bits are nonzero it's incorrect 3152 __ jcc(Assembler::notZero, bail); 3153 #endif 3154 // Adjust return pc forward to step over the safepoint poll instruction 3155 __ addptr(rbx, 2); 3156 __ movptr(Address(rbp, wordSize), rbx); 3157 } 3158 3159 __ bind(no_adjust); 3160 // Normal exit, restore registers and exit. 3161 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3162 __ ret(0); 3163 3164 #ifdef ASSERT 3165 __ bind(bail); 3166 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3167 #endif 3168 3169 // Make sure all code is generated 3170 masm->flush(); 3171 3172 // Fill-out other meta info 3173 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3174 } 3175 3176 // 3177 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3178 // 3179 // Generate a stub that calls into vm to find out the proper destination 3180 // of a java call. 
All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != NULL, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1000, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = NULL;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, NULL);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
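  // Note: frame sizes handed to RuntimeStub::new_runtime_stub are expected in
  // words; frame_size_in_words as filled in by RegisterSaver::save_live_registers
  // above is already a word count, so no conversion should be needed here.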
3258 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3259 } 3260 3261 #ifdef COMPILER2 3262 static const int native_invoker_code_size = MethodHandles::adapter_code_size; 3263 3264 class NativeInvokerGenerator : public StubCodeGenerator { 3265 address _call_target; 3266 int _shadow_space_bytes; 3267 3268 const GrowableArray<VMReg>& _input_registers; 3269 const GrowableArray<VMReg>& _output_registers; 3270 3271 int _frame_complete; 3272 int _framesize; 3273 OopMapSet* _oop_maps; 3274 public: 3275 NativeInvokerGenerator(CodeBuffer* buffer, 3276 address call_target, 3277 int shadow_space_bytes, 3278 const GrowableArray<VMReg>& input_registers, 3279 const GrowableArray<VMReg>& output_registers) 3280 : StubCodeGenerator(buffer, PrintMethodHandleStubs), 3281 _call_target(call_target), 3282 _shadow_space_bytes(shadow_space_bytes), 3283 _input_registers(input_registers), 3284 _output_registers(output_registers), 3285 _frame_complete(0), 3286 _framesize(0), 3287 _oop_maps(NULL) { 3288 assert(_output_registers.length() <= 1 3289 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns"); 3290 3291 } 3292 3293 void generate(); 3294 3295 int spill_size_in_bytes() const { 3296 if (_output_registers.length() == 0) { 3297 return 0; 3298 } 3299 VMReg reg = _output_registers.at(0); 3300 assert(reg->is_reg(), "must be a register"); 3301 if (reg->is_Register()) { 3302 return 8; 3303 } else if (reg->is_XMMRegister()) { 3304 if (UseAVX >= 3) { 3305 return 64; 3306 } else if (UseAVX >= 1) { 3307 return 32; 3308 } else { 3309 return 16; 3310 } 3311 } else { 3312 ShouldNotReachHere(); 3313 } 3314 return 0; 3315 } 3316 3317 void spill_out_registers() { 3318 if (_output_registers.length() == 0) { 3319 return; 3320 } 3321 VMReg reg = _output_registers.at(0); 3322 assert(reg->is_reg(), "must be a register"); 3323 MacroAssembler* masm = _masm; 3324 if (reg->is_Register()) { 3325 __ movptr(Address(rsp, 0), reg->as_Register()); 3326 } else if (reg->is_XMMRegister()) { 3327 if (UseAVX >= 3) { 3328 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit); 3329 } else if (UseAVX >= 1) { 3330 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister()); 3331 } else { 3332 __ movdqu(Address(rsp, 0), reg->as_XMMRegister()); 3333 } 3334 } else { 3335 ShouldNotReachHere(); 3336 } 3337 } 3338 3339 void fill_out_registers() { 3340 if (_output_registers.length() == 0) { 3341 return; 3342 } 3343 VMReg reg = _output_registers.at(0); 3344 assert(reg->is_reg(), "must be a register"); 3345 MacroAssembler* masm = _masm; 3346 if (reg->is_Register()) { 3347 __ movptr(reg->as_Register(), Address(rsp, 0)); 3348 } else if (reg->is_XMMRegister()) { 3349 if (UseAVX >= 3) { 3350 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit); 3351 } else if (UseAVX >= 1) { 3352 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3353 } else { 3354 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3355 } 3356 } else { 3357 ShouldNotReachHere(); 3358 } 3359 } 3360 3361 int frame_complete() const { 3362 return _frame_complete; 3363 } 3364 3365 int framesize() const { 3366 return (_framesize >> (LogBytesPerWord - LogBytesPerInt)); 3367 } 3368 3369 OopMapSet* oop_maps() const { 3370 return _oop_maps; 3371 } 3372 3373 private: 3374 #ifdef ASSERT 3375 bool target_uses_register(VMReg reg) { 3376 return _input_registers.contains(reg) || _output_registers.contains(reg); 3377 } 3378 #endif 3379 }; 3380 3381 RuntimeStub* 
SharedRuntime::make_native_invoker(address call_target, 3382 int shadow_space_bytes, 3383 const GrowableArray<VMReg>& input_registers, 3384 const GrowableArray<VMReg>& output_registers) { 3385 int locs_size = 64; 3386 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size); 3387 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers); 3388 g.generate(); 3389 code.log_section_sizes("nep_invoker_blob"); 3390 3391 RuntimeStub* stub = 3392 RuntimeStub::new_runtime_stub("nep_invoker_blob", 3393 &code, 3394 g.frame_complete(), 3395 g.framesize(), 3396 g.oop_maps(), false); 3397 return stub; 3398 } 3399 3400 void NativeInvokerGenerator::generate() { 3401 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict"); 3402 3403 enum layout { 3404 rbp_off, 3405 rbp_off2, 3406 return_off, 3407 return_off2, 3408 framesize // inclusive of return address 3409 }; 3410 3411 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4); 3412 assert(is_even(_framesize/2), "sp not 16-byte aligned"); 3413 3414 _oop_maps = new OopMapSet(); 3415 MacroAssembler* masm = _masm; 3416 3417 address start = __ pc(); 3418 3419 __ enter(); 3420 3421 // return address and rbp are already in place 3422 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog 3423 3424 _frame_complete = __ pc() - start; 3425 3426 address the_pc = __ pc(); 3427 3428 __ set_last_Java_frame(rsp, rbp, (address)the_pc); 3429 OopMap* map = new OopMap(_framesize, 0); 3430 _oop_maps->add_gc_map(the_pc - start, map); 3431 3432 // State transition 3433 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 3434 3435 __ call(RuntimeAddress(_call_target)); 3436 3437 __ restore_cpu_control_state_after_jni(); 3438 3439 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 3440 3441 // Force this write out before the read below 3442 __ membar(Assembler::Membar_mask_bits( 3443 Assembler::LoadLoad | Assembler::LoadStore | 3444 Assembler::StoreLoad | Assembler::StoreStore)); 3445 3446 Label L_after_safepoint_poll; 3447 Label L_safepoint_poll_slow_path; 3448 3449 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 3450 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 3451 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path); 3452 3453 __ bind(L_after_safepoint_poll); 3454 3455 // change thread state 3456 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 3457 3458 __ block_comment("reguard stack check"); 3459 Label L_reguard; 3460 Label L_after_reguard; 3461 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 3462 __ jcc(Assembler::equal, L_reguard); 3463 __ bind(L_after_reguard); 3464 3465 __ reset_last_Java_frame(r15_thread, true); 3466 3467 __ leave(); // required for proper stackwalking of RuntimeStub frame 3468 __ ret(0); 3469 3470 ////////////////////////////////////////////////////////////////////////////// 3471 3472 __ block_comment("{ L_safepoint_poll_slow_path"); 3473 __ bind(L_safepoint_poll_slow_path); 3474 __ vzeroupper(); 3475 3476 spill_out_registers(); 3477 3478 __ mov(c_rarg0, r15_thread); 3479 __ mov(r12, rsp); // remember sp 3480 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3481 __ andptr(rsp, -16); // align stack as 
required by ABI 3482 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 3483 __ mov(rsp, r12); // restore sp 3484 __ reinit_heapbase(); 3485 3486 fill_out_registers(); 3487 3488 __ jmp(L_after_safepoint_poll); 3489 __ block_comment("} L_safepoint_poll_slow_path"); 3490 3491 ////////////////////////////////////////////////////////////////////////////// 3492 3493 __ block_comment("{ L_reguard"); 3494 __ bind(L_reguard); 3495 __ vzeroupper(); 3496 3497 spill_out_registers(); 3498 3499 __ mov(r12, rsp); // remember sp 3500 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3501 __ andptr(rsp, -16); // align stack as required by ABI 3502 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 3503 __ mov(rsp, r12); // restore sp 3504 __ reinit_heapbase(); 3505 3506 fill_out_registers(); 3507 3508 __ jmp(L_after_reguard); 3509 3510 __ block_comment("} L_reguard"); 3511 3512 ////////////////////////////////////////////////////////////////////////////// 3513 3514 __ flush(); 3515 } 3516 #endif // COMPILER2 3517 3518 //------------------------------Montgomery multiplication------------------------ 3519 // 3520 3521 #ifndef _WINDOWS 3522 3523 // Subtract 0:b from carry:a. Return carry. 3524 static julong 3525 sub(julong a[], julong b[], julong carry, long len) { 3526 long long i = 0, cnt = len; 3527 julong tmp; 3528 asm volatile("clc; " 3529 "0: ; " 3530 "mov (%[b], %[i], 8), %[tmp]; " 3531 "sbb %[tmp], (%[a], %[i], 8); " 3532 "inc %[i]; dec %[cnt]; " 3533 "jne 0b; " 3534 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3535 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3536 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3537 : "memory"); 3538 return tmp; 3539 } 3540 3541 // Multiply (unsigned) Long A by Long B, accumulating the double- 3542 // length result into the accumulator formed of T0, T1, and T2. 3543 #define MACC(A, B, T0, T1, T2) \ 3544 do { \ 3545 unsigned long hi, lo; \ 3546 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3547 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3548 : "r"(A), "a"(B) : "cc"); \ 3549 } while(0) 3550 3551 // As above, but add twice the double-length result into the 3552 // accumulator. 3553 #define MACC2(A, B, T0, T1, T2) \ 3554 do { \ 3555 unsigned long hi, lo; \ 3556 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3557 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3558 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3559 : "r"(A), "a"(B) : "cc"); \ 3560 } while(0) 3561 3562 #else //_WINDOWS 3563 3564 static julong 3565 sub(julong a[], julong b[], julong carry, long len) { 3566 long i; 3567 julong tmp; 3568 unsigned char c = 1; 3569 for (i = 0; i < len; i++) { 3570 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3571 a[i] = tmp; 3572 } 3573 c = _addcarry_u64(c, carry, ~0, &tmp); 3574 return tmp; 3575 } 3576 3577 // Multiply (unsigned) Long A by Long B, accumulating the double- 3578 // length result into the accumulator formed of T0, T1, and T2. 3579 #define MACC(A, B, T0, T1, T2) \ 3580 do { \ 3581 julong hi, lo; \ 3582 lo = _umul128(A, B, &hi); \ 3583 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3584 c = _addcarry_u64(c, hi, T1, &T1); \ 3585 _addcarry_u64(c, T2, 0, &T2); \ 3586 } while(0) 3587 3588 // As above, but add twice the double-length result into the 3589 // accumulator. 
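// Both MACC and MACC2 maintain the triple-precision invariant
//
//   (T2:T1:T0) += k * (julong)A * (julong)B      (k = 1 for MACC, 2 for MACC2)
//
// A portable sketch of the same update (illustration only; it assumes a
// compiler with unsigned __int128, whereas the real macros use inline asm on
// the non-Windows path and _umul128/_addcarry_u64 here):
//
//   unsigned __int128 p = (unsigned __int128)A * B;
//   int k = 2;                                   // 2 for MACC2, 1 for MACC
//   while (k-- > 0) {
//     unsigned __int128 s = (unsigned __int128)T0 + (julong)p;    // add low half
//     T0 = (julong)s;
//     s  = (unsigned __int128)T1 + (julong)(p >> 64) + (julong)(s >> 64);
//     T1 = (julong)s;                                             // add high half plus carry
//     T2 += (julong)(s >> 64);                                    // propagate final carry
//   }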
3590 #define MACC2(A, B, T0, T1, T2) \ 3591 do { \ 3592 julong hi, lo; \ 3593 lo = _umul128(A, B, &hi); \ 3594 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3595 c = _addcarry_u64(c, hi, T1, &T1); \ 3596 _addcarry_u64(c, T2, 0, &T2); \ 3597 c = _addcarry_u64(0, lo, T0, &T0); \ 3598 c = _addcarry_u64(c, hi, T1, &T1); \ 3599 _addcarry_u64(c, T2, 0, &T2); \ 3600 } while(0) 3601 3602 #endif //_WINDOWS 3603 3604 // Fast Montgomery multiplication. The derivation of the algorithm is 3605 // in A Cryptographic Library for the Motorola DSP56000, 3606 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3607 3608 static void NOINLINE 3609 montgomery_multiply(julong a[], julong b[], julong n[], 3610 julong m[], julong inv, int len) { 3611 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3612 int i; 3613 3614 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3615 3616 for (i = 0; i < len; i++) { 3617 int j; 3618 for (j = 0; j < i; j++) { 3619 MACC(a[j], b[i-j], t0, t1, t2); 3620 MACC(m[j], n[i-j], t0, t1, t2); 3621 } 3622 MACC(a[i], b[0], t0, t1, t2); 3623 m[i] = t0 * inv; 3624 MACC(m[i], n[0], t0, t1, t2); 3625 3626 assert(t0 == 0, "broken Montgomery multiply"); 3627 3628 t0 = t1; t1 = t2; t2 = 0; 3629 } 3630 3631 for (i = len; i < 2*len; i++) { 3632 int j; 3633 for (j = i-len+1; j < len; j++) { 3634 MACC(a[j], b[i-j], t0, t1, t2); 3635 MACC(m[j], n[i-j], t0, t1, t2); 3636 } 3637 m[i-len] = t0; 3638 t0 = t1; t1 = t2; t2 = 0; 3639 } 3640 3641 while (t0) 3642 t0 = sub(m, n, t0, len); 3643 } 3644 3645 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3646 // multiplies so it should be up to 25% faster than Montgomery 3647 // multiplication. However, its loop control is more complex and it 3648 // may actually run slower on some machines. 3649 3650 static void NOINLINE 3651 montgomery_square(julong a[], julong n[], 3652 julong m[], julong inv, int len) { 3653 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3654 int i; 3655 3656 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3657 3658 for (i = 0; i < len; i++) { 3659 int j; 3660 int end = (i+1)/2; 3661 for (j = 0; j < end; j++) { 3662 MACC2(a[j], a[i-j], t0, t1, t2); 3663 MACC(m[j], n[i-j], t0, t1, t2); 3664 } 3665 if ((i & 1) == 0) { 3666 MACC(a[j], a[j], t0, t1, t2); 3667 } 3668 for (; j < i; j++) { 3669 MACC(m[j], n[i-j], t0, t1, t2); 3670 } 3671 m[i] = t0 * inv; 3672 MACC(m[i], n[0], t0, t1, t2); 3673 3674 assert(t0 == 0, "broken Montgomery square"); 3675 3676 t0 = t1; t1 = t2; t2 = 0; 3677 } 3678 3679 for (i = len; i < 2*len; i++) { 3680 int start = i-len+1; 3681 int end = start + (len - start)/2; 3682 int j; 3683 for (j = start; j < end; j++) { 3684 MACC2(a[j], a[i-j], t0, t1, t2); 3685 MACC(m[j], n[i-j], t0, t1, t2); 3686 } 3687 if ((i & 1) == 0) { 3688 MACC(a[j], a[j], t0, t1, t2); 3689 } 3690 for (; j < len; j++) { 3691 MACC(m[j], n[i-j], t0, t1, t2); 3692 } 3693 m[i-len] = t0; 3694 t0 = t1; t1 = t2; t2 = 0; 3695 } 3696 3697 while (t0) 3698 t0 = sub(m, n, t0, len); 3699 } 3700 3701 // Swap words in a longword. 3702 static julong swap(julong x) { 3703 return (x << 32) | (x >> 32); 3704 } 3705 3706 // Copy len longwords from s to d, word-swapping as we go. The 3707 // destination array is reversed. 
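// For example (illustration only), with len == 2 and the jints {1, 2, 3, 4}
// viewed as julongs on a little-endian machine:
//
//   s[0] = 0x0000000200000001   s[1] = 0x0000000400000003
//   d[0] = swap(s[1]) = 0x0000000300000004
//   d[1] = swap(s[0]) = 0x0000000100000002
//
// i.e. each 64-bit word has its 32-bit halves exchanged and the word order is
// reversed, which converts a most-significant-int-first jint array into the
// least-significant-limb-first julong layout the routines above operate on.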
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and will use
  // a total of 8k bytes of stack space here.
  int total_allocation = longwords * sizeof (julong) * 4;
  guarantee(total_allocation <= 8192, "must be");
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and will use
  // a total of 6k bytes of stack space here.
  int total_allocation = longwords * sizeof (julong) * 3;
  guarantee(total_allocation <= 8192, "must be");
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// This blob is jumped to from a compiled method
// (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog).  Other than that
  // there are no callee save registers now that adapter frames are gone.
3878 3879 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3880 3881 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3882 __ pop(rdx); // No need for exception pc anymore 3883 3884 // rax: exception handler 3885 3886 // We have a handler in rax (could be deopt blob). 3887 __ mov(r8, rax); 3888 3889 // Get the exception oop 3890 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3891 // Get the exception pc in case we are deoptimized 3892 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3893 #ifdef ASSERT 3894 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD); 3895 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD); 3896 #endif 3897 // Clear the exception oop so GC no longer processes it as a root. 3898 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD); 3899 3900 // rax: exception oop 3901 // r8: exception handler 3902 // rdx: exception pc 3903 // Jump to handler 3904 3905 __ jmp(r8); 3906 3907 // Make sure all code is generated 3908 masm->flush(); 3909 3910 // Set exception blob 3911 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3912 } 3913 #endif // COMPILER2 3914 3915 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt, 3916 int total_in_args, const VMRegPair* in_regs, 3917 int total_out_args, VMRegPair* out_regs, 3918 GrowableArray<int>& arg_order, 3919 VMRegPair tmp_vmreg) { 3920 ComputeMoveOrder order(total_in_args, in_regs, 3921 total_out_args, out_regs, 3922 in_sig_bt, arg_order, tmp_vmreg); 3923 }
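// Note on the helper above (a sketch based on the ComputeMoveOrder declaration
// in the shared runtime headers): the constructor is expected to fill
// 'arg_order' with an ordering of the argument moves such that no source
// register or stack slot is clobbered before it has been read, using
// 'tmp_vmreg' to break cycles.  Assuming arg_order receives
// (input index, output index) pairs, a caller would consume it roughly like:
//
//   GrowableArray<int> arg_order(2 * total_in_args);
//   SharedRuntime::compute_move_order(in_sig_bt, total_in_args, in_regs,
//                                     total_out_args, out_regs,
//                                     arg_order, tmp_vmreg);
//   for (int i = 0; i < arg_order.length(); i += 2) {
//     int in_idx  = arg_order.at(i);
//     int out_idx = arg_order.at(i + 1);
//     // emit the move in_regs[in_idx] -> out_regs[out_idx] here
//   }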