/*
 * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
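  //
  // A rough sketch of the save area written by push_CPU_state() and the code
  // below, keyed by the XSAVE_AREA_* byte offsets defined next (offsets are
  // from the start of the fxsave/xsave image on the stack):
  //
  //   XSAVE_AREA_BEGIN        (160)  - xmm0..xmm15 (fxsave legacy area)
  //   XSAVE_AREA_YMM_BEGIN    (576)  - upper 128 bits of ymm0..ymm15
  //   XSAVE_AREA_OPMASK_BEGIN (1088) - opmask registers k0..k7
  //   XSAVE_AREA_ZMM_BEGIN    (1152) - upper 256 bits of zmm0..zmm15
  //   XSAVE_AREA_UPPERBANK    (1664) - full zmm16..zmm31 (upper bank)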
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.
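  // Roughly, after the enter() and push_CPU_state() below the stack holds,
  // from high to low addresses: the caller's return address, the rbp saved by
  // enter(), the flags, an alignment filler and the general-purpose registers
  // pushed by push_CPU_state(), and finally the fxsave/xsave image. The
  // 'layout' enum above is the authoritative description of this frame.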

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
// (up to RegisterImpl::number_of_registers) are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
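  // For example (a sketch, following the loop below): a Java signature of
  // (int, long, Object, double, float) is assigned j_rarg0, j_rarg1, j_rarg2,
  // j_farg0 and j_farg1 respectively; once the six integer or eight float
  // registers are exhausted, arguments fall back to 4-byte stack slots,
  // allocated two at a time.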
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
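    //
    // So, continuing the sketch above: for the T_LONG at i == 0 the 8-byte
    // value is written at next_off (24, the T_VOID slot) while st_off (32)
    // is left unused (and junk-filled in debug builds), which is what the
    // stores below do.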

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less ) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // sees it just as it would after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
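
  // A brief sketch of the entry points produced below: c2i_unverified_entry
  // first checks the inline-cache holder in rax against the receiver's klass
  // and bails to the ic-miss stub on mismatch; c2i_entry (optionally preceded
  // by a class-initialization barrier for static methods) then falls into
  // gen_c2i_adapter, which repacks the arguments into the interpreter layout.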

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
// We return the amount of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.
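
// A sketch of the two C ABIs handled below: on Windows x64, integer and
// floating-point arguments share the same four parameter positions
// (c_rarg0..3 / c_farg0..3) and the caller always reserves 32 bytes of
// shadow space, hence the shared counter bumps and the stk_args minimum of
// 8 slots; on the System V ABI (Linux et al.) there are six integer and
// eight floating-point argument registers and no shadow space.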

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
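//
// A small sketch of what the cycle breaking below does: if the required
// register moves form a cycle such as { rdi -> rsi, rsi -> rdi }, one of the
// stores is redirected into the temporary register first and a final move
// from the temporary to its original destination is appended, yielding
// { rsi -> tmp, rdi -> rsi, tmp -> rdi }.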
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair        _src;
    VMRegPair        _dst;
    int              _src_index;
    int              _dst_index;
    bool             _processed;
    MoveOperation*   _next;
    MoveOperation*   _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
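    // A note on the loop below: the java and C argument lists are walked in
    // parallel from the end; a T_ARRAY java argument expands to two C
    // arguments (length, body), which is why c_arg is decremented an extra
    // time for that case. Stack-destined moves are pushed directly into
    // arg_order, while register-destined moves become edges so that
    // get_store_order() can break any cycles among them.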
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point,  so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
1515 fatal("receiver always in a register"); 1516 receiver_reg = j_rarg0; // known to be free at this point 1517 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1518 } else { 1519 // no data motion is needed 1520 receiver_reg = r->as_Register(); 1521 } 1522 } 1523 1524 // Figure out which address we are really jumping to: 1525 MethodHandles::generate_method_handle_dispatch(masm, iid, 1526 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1527 } 1528 1529 // --------------------------------------------------------------------------- 1530 // Generate a native wrapper for a given method. The method takes arguments 1531 // in the Java compiled code convention, marshals them to the native 1532 // convention (handlizes oops, etc), transitions to native, makes the call, 1533 // returns to java state (possibly blocking), unhandlizes any result and 1534 // returns. 1535 // 1536 // Critical native functions are a shorthand for the use of 1537 // GetPrimitiveArrayCritical and disallow the use of any other JNI 1538 // functions. The wrapper is expected to unpack the arguments before 1539 // passing them to the callee. Critical native functions leave the state _in_Java, 1540 // since they cannot stop for GC. 1541 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle 1542 // block and the check for pending exceptions, since it's impossible for them 1543 // to be thrown. 1544 // 1545 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1546 const methodHandle& method, 1547 int compile_id, 1548 BasicType* in_sig_bt, 1549 VMRegPair* in_regs, 1550 BasicType ret_type, 1551 address critical_entry) { 1552 if (method->is_method_handle_intrinsic()) { 1553 vmIntrinsics::ID iid = method->intrinsic_id(); 1554 intptr_t start = (intptr_t)__ pc(); 1555 int vep_offset = ((intptr_t)__ pc()) - start; 1556 gen_special_dispatch(masm, 1557 method, 1558 in_sig_bt, 1559 in_regs); 1560 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1561 __ flush(); 1562 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1563 return nmethod::new_native_nmethod(method, 1564 compile_id, 1565 masm->code(), 1566 vep_offset, 1567 frame_complete, 1568 stack_slots / VMRegImpl::slots_per_word, 1569 in_ByteSize(-1), 1570 in_ByteSize(-1), 1571 (OopMapSet*)NULL); 1572 } 1573 bool is_critical_native = true; 1574 address native_func = critical_entry; 1575 if (native_func == NULL) { 1576 native_func = method->native_function(); 1577 is_critical_native = false; 1578 } 1579 assert(native_func != NULL, "must have function"); 1580 1581 // An OopMap for lock (and class if static) 1582 OopMapSet *oop_maps = new OopMapSet(); 1583 intptr_t start = (intptr_t)__ pc(); 1584 1585 // We have received a description of where all the java args are located 1586 // on entry to the wrapper. We need to convert these args to where 1587 // the jni function will expect them.
To figure out where they go 1588 // we convert the java signature to a C signature by inserting 1589 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1590 1591 const int total_in_args = method->size_of_parameters(); 1592 int total_c_args = total_in_args; 1593 if (!is_critical_native) { 1594 total_c_args += 1; 1595 if (method->is_static()) { 1596 total_c_args++; 1597 } 1598 } else { 1599 for (int i = 0; i < total_in_args; i++) { 1600 if (in_sig_bt[i] == T_ARRAY) { 1601 total_c_args++; 1602 } 1603 } 1604 } 1605 1606 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1607 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1608 BasicType* in_elem_bt = NULL; 1609 1610 int argc = 0; 1611 if (!is_critical_native) { 1612 out_sig_bt[argc++] = T_ADDRESS; 1613 if (method->is_static()) { 1614 out_sig_bt[argc++] = T_OBJECT; 1615 } 1616 1617 for (int i = 0; i < total_in_args ; i++ ) { 1618 out_sig_bt[argc++] = in_sig_bt[i]; 1619 } 1620 } else { 1621 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); 1622 SignatureStream ss(method->signature()); 1623 for (int i = 0; i < total_in_args ; i++ ) { 1624 if (in_sig_bt[i] == T_ARRAY) { 1625 // Arrays are passed as int, elem* pair 1626 out_sig_bt[argc++] = T_INT; 1627 out_sig_bt[argc++] = T_ADDRESS; 1628 ss.skip_array_prefix(1); // skip one '[' 1629 assert(ss.is_primitive(), "primitive type expected"); 1630 in_elem_bt[i] = ss.type(); 1631 } else { 1632 out_sig_bt[argc++] = in_sig_bt[i]; 1633 in_elem_bt[i] = T_VOID; 1634 } 1635 if (in_sig_bt[i] != T_VOID) { 1636 assert(in_sig_bt[i] == ss.type() || 1637 in_sig_bt[i] == T_ARRAY, "must match"); 1638 ss.next(); 1639 } 1640 } 1641 } 1642 1643 // Now figure out where the args must be stored and how much stack space 1644 // they require. 1645 int out_arg_slots; 1646 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1647 1648 // Compute framesize for the wrapper. We need to handlize all oops in 1649 // incoming registers 1650 1651 // Calculate the total number of stack slots we will need. 1652 1653 // First count the abi requirement plus all of the outgoing args 1654 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1655 1656 // Now the space for the inbound oop handle area 1657 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1658 if (is_critical_native) { 1659 // Critical natives may have to call out so they need a save area 1660 // for register arguments. 
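// Worked example (illustrative only, not tied to any particular method): a
// critical native taking (jint, jlong, jfloat) entirely in registers would be
// counted below as single_slots = 2 (the int and the float) and
// double_slots = 1 (the long), giving total_save_slots = 1 * 2 + 2 = 4.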
1661 int double_slots = 0; 1662 int single_slots = 0; 1663 for ( int i = 0; i < total_in_args; i++) { 1664 if (in_regs[i].first()->is_Register()) { 1665 const Register reg = in_regs[i].first()->as_Register(); 1666 switch (in_sig_bt[i]) { 1667 case T_BOOLEAN: 1668 case T_BYTE: 1669 case T_SHORT: 1670 case T_CHAR: 1671 case T_INT: single_slots++; break; 1672 case T_ARRAY: // specific to LP64 (7145024) 1673 case T_LONG: double_slots++; break; 1674 default: ShouldNotReachHere(); 1675 } 1676 } else if (in_regs[i].first()->is_XMMRegister()) { 1677 switch (in_sig_bt[i]) { 1678 case T_FLOAT: single_slots++; break; 1679 case T_DOUBLE: double_slots++; break; 1680 default: ShouldNotReachHere(); 1681 } 1682 } else if (in_regs[i].first()->is_FloatRegister()) { 1683 ShouldNotReachHere(); 1684 } 1685 } 1686 total_save_slots = double_slots * 2 + single_slots; 1687 // align the save area 1688 if (double_slots != 0) { 1689 stack_slots = align_up(stack_slots, 2); 1690 } 1691 } 1692 1693 int oop_handle_offset = stack_slots; 1694 stack_slots += total_save_slots; 1695 1696 // Now any space we need for handlizing a klass if static method 1697 1698 int klass_slot_offset = 0; 1699 int klass_offset = -1; 1700 int lock_slot_offset = 0; 1701 bool is_static = false; 1702 1703 if (method->is_static()) { 1704 klass_slot_offset = stack_slots; 1705 stack_slots += VMRegImpl::slots_per_word; 1706 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1707 is_static = true; 1708 } 1709 1710 // Plus a lock if needed 1711 1712 if (method->is_synchronized()) { 1713 lock_slot_offset = stack_slots; 1714 stack_slots += VMRegImpl::slots_per_word; 1715 } 1716 1717 // Now a place (+2) to save return values or temp during shuffling 1718 // + 4 for return address (which we own) and saved rbp 1719 stack_slots += 6; 1720 1721 // Ok The space we have allocated will look like: 1722 // 1723 // 1724 // FP-> | | 1725 // |---------------------| 1726 // | 2 slots for moves | 1727 // |---------------------| 1728 // | lock box (if sync) | 1729 // |---------------------| <- lock_slot_offset 1730 // | klass (if static) | 1731 // |---------------------| <- klass_slot_offset 1732 // | oopHandle area | 1733 // |---------------------| <- oop_handle_offset (6 java arg registers) 1734 // | outbound memory | 1735 // | based arguments | 1736 // | | 1737 // |---------------------| 1738 // | | 1739 // SP-> | out_preserved_slots | 1740 // 1741 // 1742 1743 1744 // Now compute actual number of stack words we need rounding to make 1745 // stack properly aligned. 1746 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1747 1748 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1749 1750 // First thing make an ic check to see if we should even be here 1751 1752 // We are free to use all registers as temps without saving them and 1753 // restoring them except rbp. rbp is the only callee save register 1754 // as far as the interpreter and the compiler(s) are concerned. 
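// What follows is the inline cache check: rax (ic_reg) arrives holding the
// expected klass, so we load the receiver's klass into rscratch1, compare the
// two, and jump to the ic-miss stub on a mismatch before falling through to
// the verified entry point.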
1755 1756 1757 const Register ic_reg = rax; 1758 const Register receiver = j_rarg0; 1759 1760 Label hit; 1761 Label exception_pending; 1762 1763 assert_different_registers(ic_reg, receiver, rscratch1); 1764 __ verify_oop(receiver); 1765 __ load_klass(rscratch1, receiver, rscratch2); 1766 __ cmpq(ic_reg, rscratch1); 1767 __ jcc(Assembler::equal, hit); 1768 1769 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1770 1771 // Verified entry point must be aligned 1772 __ align(8); 1773 1774 __ bind(hit); 1775 1776 int vep_offset = ((intptr_t)__ pc()) - start; 1777 1778 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1779 Label L_skip_barrier; 1780 Register klass = r10; 1781 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1782 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1783 1784 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1785 1786 __ bind(L_skip_barrier); 1787 } 1788 1789 #ifdef COMPILER1 1790 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1791 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1792 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1793 } 1794 #endif // COMPILER1 1795 1796 // The instruction at the verified entry point must be 5 bytes or longer 1797 // because it can be patched on the fly by make_non_entrant. The stack bang 1798 // instruction fits that requirement. 1799 1800 // Generate stack overflow check 1801 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1802 1803 // Generate a new frame for the wrapper. 1804 __ enter(); 1805 // -2 because return address is already present and so is saved rbp 1806 __ subptr(rsp, stack_size - 2*wordSize); 1807 1808 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1809 bs->nmethod_entry_barrier(masm); 1810 1811 // Frame is now completed as far as size and linkage. 1812 int frame_complete = ((intptr_t)__ pc()) - start; 1813 1814 if (UseRTMLocking) { 1815 // Abort RTM transaction before calling JNI 1816 // because critical section will be large and will be 1817 // aborted anyway. Also nmethod could be deoptimized. 1818 __ xabort(0); 1819 } 1820 1821 #ifdef ASSERT 1822 { 1823 Label L; 1824 __ mov(rax, rsp); 1825 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI) 1826 __ cmpptr(rax, rsp); 1827 __ jcc(Assembler::equal, L); 1828 __ stop("improperly aligned stack"); 1829 __ bind(L); 1830 } 1831 #endif /* ASSERT */ 1832 1833 1834 // We use r14 as the oop handle for the receiver/klass 1835 // It is callee save so it survives the call to native 1836 1837 const Register oop_handle_reg = r14; 1838 1839 // 1840 // We immediately shuffle the arguments so that any vm call we have to 1841 // make from here on out (sync slow path, jvmti, etc.) we will have 1842 // captured the oops from our caller and have a valid oopMap for 1843 // them. 1844 1845 // ----------------- 1846 // The Grand Shuffle 1847 1848 // The Java calling convention is either equal (linux) or denser (win64) than the 1849 // c calling convention. However the because of the jni_env argument the c calling 1850 // convention always has at least one more (and two for static) arguments than Java. 
1851 // Therefore if we move the args from java -> c backwards then we will never have 1852 // a register->register conflict and we don't have to build a dependency graph 1853 // and figure out how to break any cycles. 1854 // 1855 1856 // Record esp-based slot for receiver on stack for non-static methods 1857 int receiver_offset = -1; 1858 1859 // This is a trick. We double the stack slots so we can claim 1860 // the oops in the caller's frame. Since we are sure to have 1861 // more args than the caller doubling is enough to make 1862 // sure we can capture all the incoming oop args from the 1863 // caller. 1864 // 1865 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1866 1867 // Mark location of rbp (someday) 1868 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1869 1870 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1871 // All inbound args are referenced based on rbp and all outbound args via rsp. 1872 1873 1874 #ifdef ASSERT 1875 bool reg_destroyed[RegisterImpl::number_of_registers]; 1876 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1877 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1878 reg_destroyed[r] = false; 1879 } 1880 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1881 freg_destroyed[f] = false; 1882 } 1883 1884 #endif /* ASSERT */ 1885 1886 // This may iterate in two different directions depending on the 1887 // kind of native it is. The reason is that for regular JNI natives 1888 // the incoming and outgoing registers are offset upwards and for 1889 // critical natives they are offset down. 1890 GrowableArray<int> arg_order(2 * total_in_args); 1891 1892 VMRegPair tmp_vmreg; 1893 tmp_vmreg.set2(rbx->as_VMReg()); 1894 1895 if (!is_critical_native) { 1896 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1897 arg_order.push(i); 1898 arg_order.push(c_arg); 1899 } 1900 } else { 1901 // Compute a valid move order, using tmp_vmreg to break any cycles 1902 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); 1903 } 1904 1905 int temploc = -1; 1906 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1907 int i = arg_order.at(ai); 1908 int c_arg = arg_order.at(ai + 1); 1909 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1910 if (c_arg == -1) { 1911 assert(is_critical_native, "should only be required for critical natives"); 1912 // This arg needs to be moved to a temporary 1913 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); 1914 in_regs[i] = tmp_vmreg; 1915 temploc = i; 1916 continue; 1917 } else if (i == -1) { 1918 assert(is_critical_native, "should only be required for critical natives"); 1919 // Read from the temporary location 1920 assert(temploc != -1, "must be valid"); 1921 i = temploc; 1922 temploc = -1; 1923 } 1924 #ifdef ASSERT 1925 if (in_regs[i].first()->is_Register()) { 1926 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1927 } else if (in_regs[i].first()->is_XMMRegister()) { 1928 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1929 } 1930 if (out_regs[c_arg].first()->is_Register()) { 1931 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1932 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1933 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1934 } 1935 #endif /* 
ASSERT */ 1936 switch (in_sig_bt[i]) { 1937 case T_ARRAY: 1938 if (is_critical_native) { 1939 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]); 1940 c_arg++; 1941 #ifdef ASSERT 1942 if (out_regs[c_arg].first()->is_Register()) { 1943 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1944 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1945 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1946 } 1947 #endif 1948 break; 1949 } 1950 case T_OBJECT: 1951 assert(!is_critical_native, "no oop arguments"); 1952 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 1953 ((i == 0) && (!is_static)), 1954 &receiver_offset); 1955 break; 1956 case T_VOID: 1957 break; 1958 1959 case T_FLOAT: 1960 __ float_move(in_regs[i], out_regs[c_arg]); 1961 break; 1962 1963 case T_DOUBLE: 1964 assert( i + 1 < total_in_args && 1965 in_sig_bt[i + 1] == T_VOID && 1966 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 1967 __ double_move(in_regs[i], out_regs[c_arg]); 1968 break; 1969 1970 case T_LONG : 1971 __ long_move(in_regs[i], out_regs[c_arg]); 1972 break; 1973 1974 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 1975 1976 default: 1977 __ move32_64(in_regs[i], out_regs[c_arg]); 1978 } 1979 } 1980 1981 int c_arg; 1982 1983 // Pre-load a static method's oop into r14. Used both by locking code and 1984 // the normal JNI call code. 1985 if (!is_critical_native) { 1986 // point c_arg at the first arg that is already loaded in case we 1987 // need to spill before we call out 1988 c_arg = total_c_args - total_in_args; 1989 1990 if (method->is_static()) { 1991 1992 // load oop into a register 1993 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 1994 1995 // Now handlize the static class mirror; it's known to be not-null. 1996 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 1997 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 1998 1999 // Now get the handle 2000 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2001 // store the klass handle as second argument 2002 __ movptr(c_rarg1, oop_handle_reg); 2003 // and protect the arg if we must spill 2004 c_arg--; 2005 } 2006 } else { 2007 // For JNI critical methods we need to save all registers in save_args. 2008 c_arg = 0; 2009 } 2010 2011 // Change state to native (we save the return address in the thread, since it might not 2012 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2013 // points into the right code segment. It does not have to be the correct return pc. 2014 // We use the same pc/oopMap repeatedly when we call out. 2015 2016 intptr_t the_pc = (intptr_t) __ pc(); 2017 oop_maps->add_gc_map(the_pc - start, map); 2018 2019 __ set_last_Java_frame(rsp, noreg, (address)the_pc); 2020 2021 2022 // We have all of the arguments set up at this point. We must not touch any of the 2023 // argument registers at this point (if we had to save/restore them, there would be no oop map describing the oops they hold).
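// Descriptive note: the two probe blocks below bracket their leaf calls with
// save_args()/restore_args() precisely because the C argument registers are
// already loaded at this point and would otherwise be clobbered by the call.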
2024 2025 { 2026 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2027 // protect the args we've loaded 2028 save_args(masm, total_c_args, c_arg, out_regs); 2029 __ mov_metadata(c_rarg1, method()); 2030 __ call_VM_leaf( 2031 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2032 r15_thread, c_rarg1); 2033 restore_args(masm, total_c_args, c_arg, out_regs); 2034 } 2035 2036 // RedefineClasses() tracing support for obsolete method entry 2037 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2038 // protect the args we've loaded 2039 save_args(masm, total_c_args, c_arg, out_regs); 2040 __ mov_metadata(c_rarg1, method()); 2041 __ call_VM_leaf( 2042 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2043 r15_thread, c_rarg1); 2044 restore_args(masm, total_c_args, c_arg, out_regs); 2045 } 2046 2047 // Lock a synchronized method 2048 2049 // Register definitions used by locking and unlocking 2050 2051 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2052 const Register obj_reg = rbx; // Will contain the oop 2053 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2054 const Register old_hdr = r13; // value of old header at unlock time 2055 2056 Label slow_path_lock; 2057 Label lock_done; 2058 2059 if (method->is_synchronized()) { 2060 assert(!is_critical_native, "unhandled"); 2061 2062 2063 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2064 2065 // Get the handle (the 2nd argument) 2066 __ mov(oop_handle_reg, c_rarg1); 2067 2068 // Get address of the box 2069 2070 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2071 2072 // Load the oop from the handle 2073 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2074 2075 if (UseBiasedLocking) { 2076 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock); 2077 } 2078 2079 // Load immediate 1 into swap_reg %rax 2080 __ movl(swap_reg, 1); 2081 2082 // Load (object->mark() | 1) into swap_reg %rax 2083 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2084 2085 // Save (object->mark() | 1) into BasicLock's displaced header 2086 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2087 2088 // src -> dest iff dest == rax else rax <- dest 2089 __ lock(); 2090 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2091 __ jcc(Assembler::equal, lock_done); 2092 2093 // Hmm should this move to the slow path code area??? 2094 2095 // Test if the oopMark is an obvious stack pointer, i.e., 2096 // 1) (mark & 3) == 0, and 2097 // 2) rsp <= mark < mark + os::pagesize() 2098 // These 3 tests can be done by evaluating the following 2099 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2100 // assuming both stack pointer and pagesize have their 2101 // least significant 2 bits clear. 
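// Hedged worked example (assumes a 4K page, i.e. os::vm_page_size() == 0x1000,
// purely for illustration): the mask 3 - 0x1000 is ...fffff003, so the andptr
// below leaves zero only when 0 <= mark - rsp < 0x1000 with the low two bits
// clear, i.e. when the mark is a stack lock sitting just above rsp; any real
// object header elsewhere produces a non-zero result and takes the slow path.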
2102 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2103 2104 __ subptr(swap_reg, rsp); 2105 __ andptr(swap_reg, 3 - os::vm_page_size()); 2106 2107 // Save the test result; for the recursive case, the result is zero 2108 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2109 __ jcc(Assembler::notEqual, slow_path_lock); 2110 2111 // Slow path will re-enter here 2112 2113 __ bind(lock_done); 2114 } 2115 2116 // Finally just about ready to make the JNI call 2117 2118 // get JNIEnv* which is first argument to native 2119 if (!is_critical_native) { 2120 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2121 2122 // Now set thread in native 2123 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2124 } 2125 2126 __ call(RuntimeAddress(native_func)); 2127 2128 // Verify or restore cpu control state after JNI call 2129 __ restore_cpu_control_state_after_jni(); 2130 2131 // Unpack native results. 2132 switch (ret_type) { 2133 case T_BOOLEAN: __ c2bool(rax); break; 2134 case T_CHAR : __ movzwl(rax, rax); break; 2135 case T_BYTE : __ sign_extend_byte (rax); break; 2136 case T_SHORT : __ sign_extend_short(rax); break; 2137 case T_INT : /* nothing to do */ break; 2138 case T_DOUBLE : 2139 case T_FLOAT : 2140 // Result is in xmm0; we'll save it as needed 2141 break; 2142 case T_ARRAY: // Really a handle 2143 case T_OBJECT: // Really a handle 2144 break; // can't de-handlize until after safepoint check 2145 case T_VOID: break; 2146 case T_LONG: break; 2147 default : ShouldNotReachHere(); 2148 } 2149 2150 Label after_transition; 2151 2152 // If this is a critical native, check for a safepoint or suspend request after the call. 2153 // If a safepoint is needed, transition to native, then to native_trans to handle 2154 // safepoints like the native methods that are not critical natives. 2155 if (is_critical_native) { 2156 Label needs_safepoint; 2157 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */); 2158 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2159 __ jcc(Assembler::equal, after_transition); 2160 __ bind(needs_safepoint); 2161 } 2162 2163 // Switch thread to "native transition" state before reading the synchronization state. 2164 // This additional state is necessary because reading and testing the synchronization 2165 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2166 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2167 // VM thread changes sync state to synchronizing and suspends threads for GC. 2168 // Thread A is resumed to finish this native method, but doesn't block here since it 2169 // didn't see any synchronization in progress, and escapes.
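// In short: publish _thread_in_native_trans, fence, then re-check the
// safepoint/suspend state; if either is set we call
// JavaThread::check_special_condition_for_native_trans() on the slow path
// below, which blocks as needed before we return to _thread_in_Java.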
2170 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2171 2172 // Force this write out before the read below 2173 __ membar(Assembler::Membar_mask_bits( 2174 Assembler::LoadLoad | Assembler::LoadStore | 2175 Assembler::StoreLoad | Assembler::StoreStore)); 2176 2177 // check for safepoint operation in progress and/or pending suspend requests 2178 { 2179 Label Continue; 2180 Label slow_path; 2181 2182 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2183 2184 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2185 __ jcc(Assembler::equal, Continue); 2186 __ bind(slow_path); 2187 2188 // Don't use call_VM as it will see a possible pending exception and forward it 2189 // and never return here preventing us from clearing _last_native_pc down below. 2190 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2191 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2192 // by hand. 2193 // 2194 __ vzeroupper(); 2195 save_native_result(masm, ret_type, stack_slots); 2196 __ mov(c_rarg0, r15_thread); 2197 __ mov(r12, rsp); // remember sp 2198 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2199 __ andptr(rsp, -16); // align stack as required by ABI 2200 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2201 __ mov(rsp, r12); // restore sp 2202 __ reinit_heapbase(); 2203 // Restore any method result value 2204 restore_native_result(masm, ret_type, stack_slots); 2205 __ bind(Continue); 2206 } 2207 2208 // change thread state 2209 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2210 __ bind(after_transition); 2211 2212 Label reguard; 2213 Label reguard_done; 2214 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2215 __ jcc(Assembler::equal, reguard); 2216 __ bind(reguard_done); 2217 2218 // native result if any is live 2219 2220 // Unlock 2221 Label unlock_done; 2222 Label slow_path_unlock; 2223 if (method->is_synchronized()) { 2224 2225 // Get locked oop from the handle we passed to jni 2226 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2227 2228 Label done; 2229 2230 if (UseBiasedLocking) { 2231 __ biased_locking_exit(obj_reg, old_hdr, done); 2232 } 2233 2234 // Simple recursive lock? 
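// (A zero displaced header stored in the lock slot marks a recursive stack
// lock; in that case the cmpptr below sees NULL and there is nothing to
// unlock.)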
2235 2236 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2237 __ jcc(Assembler::equal, done); 2238 2239 // Must save rax if if it is live now because cmpxchg must use it 2240 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2241 save_native_result(masm, ret_type, stack_slots); 2242 } 2243 2244 2245 // get address of the stack lock 2246 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2247 // get old displaced header 2248 __ movptr(old_hdr, Address(rax, 0)); 2249 2250 // Atomic swap old header if oop still contains the stack lock 2251 __ lock(); 2252 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2253 __ jcc(Assembler::notEqual, slow_path_unlock); 2254 2255 // slow path re-enters here 2256 __ bind(unlock_done); 2257 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2258 restore_native_result(masm, ret_type, stack_slots); 2259 } 2260 2261 __ bind(done); 2262 2263 } 2264 { 2265 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2266 save_native_result(masm, ret_type, stack_slots); 2267 __ mov_metadata(c_rarg1, method()); 2268 __ call_VM_leaf( 2269 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2270 r15_thread, c_rarg1); 2271 restore_native_result(masm, ret_type, stack_slots); 2272 } 2273 2274 __ reset_last_Java_frame(false); 2275 2276 // Unbox oop result, e.g. JNIHandles::resolve value. 2277 if (is_reference_type(ret_type)) { 2278 __ resolve_jobject(rax /* value */, 2279 r15_thread /* thread */, 2280 rcx /* tmp */); 2281 } 2282 2283 if (CheckJNICalls) { 2284 // clear_pending_jni_exception_check 2285 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2286 } 2287 2288 if (!is_critical_native) { 2289 // reset handle block 2290 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2291 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2292 } 2293 2294 // pop our frame 2295 2296 __ leave(); 2297 2298 if (!is_critical_native) { 2299 // Any exception pending? 2300 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2301 __ jcc(Assembler::notEqual, exception_pending); 2302 } 2303 2304 // Return 2305 2306 __ ret(0); 2307 2308 // Unexpected paths are out of line and go here 2309 2310 if (!is_critical_native) { 2311 // forward the exception 2312 __ bind(exception_pending); 2313 2314 // and forward the exception 2315 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2316 } 2317 2318 // Slow path locking & unlocking 2319 if (method->is_synchronized()) { 2320 2321 // BEGIN Slow path lock 2322 __ bind(slow_path_lock); 2323 2324 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2325 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2326 2327 // protect the args we've loaded 2328 save_args(masm, total_c_args, c_arg, out_regs); 2329 2330 __ mov(c_rarg0, obj_reg); 2331 __ mov(c_rarg1, lock_reg); 2332 __ mov(c_rarg2, r15_thread); 2333 2334 // Not a leaf but we have last_Java_frame setup as we want 2335 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2336 restore_args(masm, total_c_args, c_arg, out_regs); 2337 2338 #ifdef ASSERT 2339 { Label L; 2340 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2341 __ jcc(Assembler::equal, L); 2342 __ stop("no pending exception allowed on exit from monitorenter"); 2343 __ bind(L); 2344 } 2345 #endif 2346 __ jmp(lock_done); 2347 2348 // END Slow path lock 2349 2350 // BEGIN Slow path unlock 2351 __ bind(slow_path_unlock); 2352 2353 // If we haven't already saved the native result we must save it now as xmm registers 2354 // are still exposed. 2355 __ vzeroupper(); 2356 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2357 save_native_result(masm, ret_type, stack_slots); 2358 } 2359 2360 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2361 2362 __ mov(c_rarg0, obj_reg); 2363 __ mov(c_rarg2, r15_thread); 2364 __ mov(r12, rsp); // remember sp 2365 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2366 __ andptr(rsp, -16); // align stack as required by ABI 2367 2368 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2369 // NOTE that obj_reg == rbx currently 2370 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2371 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2372 2373 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2374 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2375 __ mov(rsp, r12); // restore sp 2376 __ reinit_heapbase(); 2377 #ifdef ASSERT 2378 { 2379 Label L; 2380 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2381 __ jcc(Assembler::equal, L); 2382 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2383 __ bind(L); 2384 } 2385 #endif /* ASSERT */ 2386 2387 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2388 2389 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2390 restore_native_result(masm, ret_type, stack_slots); 2391 } 2392 __ jmp(unlock_done); 2393 2394 // END Slow path unlock 2395 2396 } // synchronized 2397 2398 // SLOW PATH Reguard the stack if needed 2399 2400 __ bind(reguard); 2401 __ vzeroupper(); 2402 save_native_result(masm, ret_type, stack_slots); 2403 __ mov(r12, rsp); // remember sp 2404 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2405 __ andptr(rsp, -16); // align stack as required by ABI 2406 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2407 __ mov(rsp, r12); // restore sp 2408 __ reinit_heapbase(); 2409 restore_native_result(masm, ret_type, stack_slots); 2410 // and continue 2411 __ jmp(reguard_done); 2412 2413 2414 2415 __ flush(); 2416 2417 nmethod *nm = nmethod::new_native_nmethod(method, 2418 compile_id, 2419 masm->code(), 2420 vep_offset, 2421 frame_complete, 2422 stack_slots / VMRegImpl::slots_per_word, 2423 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2424 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2425 oop_maps); 2426 2427 return nm; 2428 } 2429 2430 // this function returns the adjustment (in number of words) to a c2i adapter 2431 // activation for use during deoptimization 2432 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2433 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2434 } 2435 2436 2437 uint SharedRuntime::out_preserve_stack_slots() { 2438 return 0; 2439 } 2440 2441 2442 // Number of stack slots between incoming argument block and the start of 2443 // a new frame. The PROLOG must add this many slots to the stack. The 2444 // EPILOG must remove this many slots. amd64 needs two slots for 2445 // return address. 2446 uint SharedRuntime::in_preserve_stack_slots() { 2447 return 4 + 2 * VerifyStackAtCalls; 2448 } 2449 2450 //------------------------------generate_deopt_blob---------------------------- 2451 void SharedRuntime::generate_deopt_blob() { 2452 // Allocate space for the code 2453 ResourceMark rm; 2454 // Setup code generation tools 2455 int pad = 0; 2456 if (UseAVX > 2) { 2457 pad += 1024; 2458 } 2459 #if INCLUDE_JVMCI 2460 if (EnableJVMCI) { 2461 pad += 512; // Increase the buffer size when compiling for JVMCI 2462 } 2463 #endif 2464 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2465 MacroAssembler* masm = new MacroAssembler(&buffer); 2466 int frame_size_in_words; 2467 OopMap* map = NULL; 2468 OopMapSet *oop_maps = new OopMapSet(); 2469 2470 // ------------- 2471 // This code enters when returning to a de-optimized nmethod. A return 2472 // address has been pushed on the stack, and return values are in 2473 // registers. 2474 // If we are doing a normal deopt then we were called from the patched 2475 // nmethod from the point we returned to the nmethod. So the return 2476 // address on the stack is wrong by NativeCall::instruction_size. 2477 // We will adjust the value so it looks like we have the original return 2478 // address on the stack (like when we eagerly deoptimized). 2479 // In the case of an exception pending when deoptimizing, we enter 2480 // with a return address on the stack that points after the call we patched 2481 // into the exception handler. We have the following register state from, 2482 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2483 // rax: exception oop 2484 // rbx: exception handler 2485 // rdx: throwing pc 2486 // So in this case we simply jam rdx into the useless return address and 2487 // the stack looks just like we want. 2488 // 2489 // At this point we need to de-opt. We save the argument return 2490 // registers. We call the first C routine, fetch_unroll_info(). This 2491 // routine captures the return values and returns a structure which 2492 // describes the current frame size and the sizes of all replacement frames. 2493 // The current frame is compiled code and may contain many inlined 2494 // functions, each with their own JVM state. We pop the current frame, then 2495 // push all the new frames. Then we call the C routine unpack_frames() to 2496 // populate these frames. Finally unpack_frames() returns us the new target 2497 // address. Notice that callee-save registers are BLOWN here; they have 2498 // already been captured in the vframeArray at the time the return PC was 2499 // patched. 2500 address start = __ pc(); 2501 Label cont; 2502 2503 // Prolog for the non-exception case!
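// Descriptive overview of the blob being generated: its entry points are
// emitted in this order: the normal deopt prolog (here), the reexecute entry,
// the optional JVMCI uncommon-trap and implicit-exception entries, the
// exception entry, and the exception-in-tls entry. The non-JVMCI entries all
// meet at the 'cont' label; the JVMCI paths rejoin at
// after_fetch_unroll_info_call.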
2504 2505 // Save everything in sight. 2506 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2507 2508 // Normal deoptimization. Save exec mode for unpack_frames. 2509 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2510 __ jmp(cont); 2511 2512 int reexecute_offset = __ pc() - start; 2513 #if INCLUDE_JVMCI && !defined(COMPILER1) 2514 if (EnableJVMCI && UseJVMCICompiler) { 2515 // JVMCI does not use this kind of deoptimization 2516 __ should_not_reach_here(); 2517 } 2518 #endif 2519 2520 // Reexecute case 2521 // return address is the pc describes what bci to do re-execute at 2522 2523 // No need to update map as each call to save_live_registers will produce identical oopmap 2524 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2525 2526 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2527 __ jmp(cont); 2528 2529 #if INCLUDE_JVMCI 2530 Label after_fetch_unroll_info_call; 2531 int implicit_exception_uncommon_trap_offset = 0; 2532 int uncommon_trap_offset = 0; 2533 2534 if (EnableJVMCI) { 2535 implicit_exception_uncommon_trap_offset = __ pc() - start; 2536 2537 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2538 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD); 2539 2540 uncommon_trap_offset = __ pc() - start; 2541 2542 // Save everything in sight. 2543 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2544 // fetch_unroll_info needs to call last_java_frame() 2545 __ set_last_Java_frame(noreg, noreg, NULL); 2546 2547 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2548 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2549 2550 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute); 2551 __ mov(c_rarg0, r15_thread); 2552 __ movl(c_rarg2, r14); // exec mode 2553 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2554 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2555 2556 __ reset_last_Java_frame(false); 2557 2558 __ jmp(after_fetch_unroll_info_call); 2559 } // EnableJVMCI 2560 #endif // INCLUDE_JVMCI 2561 2562 int exception_offset = __ pc() - start; 2563 2564 // Prolog for exception case 2565 2566 // all registers are dead at this entry point, except for rax, and 2567 // rdx which contain the exception oop and exception pc 2568 // respectively. Set them in TLS and fall thru to the 2569 // unpack_with_exception_in_tls entry point. 2570 2571 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2572 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2573 2574 int exception_in_tls_offset = __ pc() - start; 2575 2576 // new implementation because exception oop is now passed in JavaThread 2577 2578 // Prolog for exception case 2579 // All registers must be preserved because they might be used by LinearScan 2580 // Exceptiop oop and throwing PC are passed in JavaThread 2581 // tos: stack at point of call to method that threw the exception (i.e. only 2582 // args are on the stack, no return address) 2583 2584 // make room on stack for the return address 2585 // It will be patched later with the throwing pc. The correct value is not 2586 // available now because loading it from memory would destroy registers. 
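// (The push(0) below is just a placeholder for the return-address slot; it is
// patched with the throwing pc once the registers have been saved, as the
// comment above explains.)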
2587 __ push(0); 2588 2589 // Save everything in sight. 2590 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2591 2592 // Now it is safe to overwrite any register 2593 2594 // Deopt during an exception. Save exec mode for unpack_frames. 2595 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2596 2597 // load throwing pc from JavaThread and patch it as the return address 2598 // of the current frame. Then clear the field in JavaThread 2599 2600 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2601 __ movptr(Address(rbp, wordSize), rdx); 2602 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2603 2604 #ifdef ASSERT 2605 // verify that there is really an exception oop in JavaThread 2606 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2607 __ verify_oop(rax); 2608 2609 // verify that there is no pending exception 2610 Label no_pending_exception; 2611 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2612 __ testptr(rax, rax); 2613 __ jcc(Assembler::zero, no_pending_exception); 2614 __ stop("must not have pending exception here"); 2615 __ bind(no_pending_exception); 2616 #endif 2617 2618 __ bind(cont); 2619 2620 // Call C code. Need thread and this frame, but NOT official VM entry 2621 // crud. We cannot block on this call, no GC can happen. 2622 // 2623 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2624 2625 // fetch_unroll_info needs to call last_java_frame(). 2626 2627 __ set_last_Java_frame(noreg, noreg, NULL); 2628 #ifdef ASSERT 2629 { Label L; 2630 __ cmpptr(Address(r15_thread, 2631 JavaThread::last_Java_fp_offset()), 2632 (int32_t)0); 2633 __ jcc(Assembler::equal, L); 2634 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2635 __ bind(L); 2636 } 2637 #endif // ASSERT 2638 __ mov(c_rarg0, r15_thread); 2639 __ movl(c_rarg1, r14); // exec_mode 2640 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2641 2642 // Need to have an oopmap that tells fetch_unroll_info where to 2643 // find any register it might need. 2644 oop_maps->add_gc_map(__ pc() - start, map); 2645 2646 __ reset_last_Java_frame(false); 2647 2648 #if INCLUDE_JVMCI 2649 if (EnableJVMCI) { 2650 __ bind(after_fetch_unroll_info_call); 2651 } 2652 #endif 2653 2654 // Load UnrollBlock* into rdi 2655 __ mov(rdi, rax); 2656 2657 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2658 Label noException; 2659 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2660 __ jcc(Assembler::notEqual, noException); 2661 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2662 // QQQ this is useless it was NULL above 2663 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2664 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2665 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2666 2667 __ verify_oop(rax); 2668 2669 // Overwrite the result registers with the exception results. 2670 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2671 // I think this is useless 2672 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2673 2674 __ bind(noException); 2675 2676 // Only register save data is on the stack. 2677 // Now restore the result registers. 
Everything else is either dead 2678 // or captured in the vframeArray. 2679 RegisterSaver::restore_result_registers(masm); 2680 2681 // All of the register save area has been popped off the stack. Only the 2682 // return address remains. 2683 2684 // Pop all the frames we must move/replace. 2685 // 2686 // Frame picture (youngest to oldest) 2687 // 1: self-frame (no frame link) 2688 // 2: deopting frame (no frame link) 2689 // 3: caller of deopting frame (could be compiled/interpreted). 2690 // 2691 // Note: by leaving the return address of the self-frame on the stack 2692 // and using the size of frame 2 to adjust the stack, 2693 // the return address to frame 3 will still be on the stack when we are done. 2694 2695 // Pop deoptimized frame 2696 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes())); 2697 __ addptr(rsp, rcx); 2698 2699 // rsp should be pointing at the return address to the caller (3) 2700 2701 // Pick up the initial fp we should save 2702 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2703 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2704 2705 #ifdef ASSERT 2706 // Compilers generate code that bangs the stack by as much as the 2707 // interpreter would need. So this stack banging should never 2708 // trigger a fault. Verify that it does not on non-product builds. 2709 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2710 __ bang_stack_size(rbx, rcx); 2711 #endif 2712 2713 // Load address of array of frame pcs into rcx 2714 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2715 2716 // Trash the old pc 2717 __ addptr(rsp, wordSize); 2718 2719 // Load address of array of frame sizes into rsi 2720 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes())); 2721 2722 // Load counter into rdx 2723 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); 2724 2725 // Now adjust the caller's stack to make up for the extra locals, 2726 // but record the original sp so that we can save it in the skeletal interpreter 2727 // frame; the stack walking of interpreter_sender will then get the unextended sp 2728 // value and not the "real" sp value.
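// Register roles in the frame-pushing loop below: rsi walks the array of
// frame sizes, rcx walks the array of frame pcs, rdx counts the frames down,
// and r8 (sender_sp) carries each caller's unextended sp into the skeletal
// interpreter frame being built.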
2729 2730 const Register sender_sp = r8; 2731 2732 __ mov(sender_sp, rsp); 2733 __ movl(rbx, Address(rdi, 2734 Deoptimization::UnrollBlock:: 2735 caller_adjustment_offset_in_bytes())); 2736 __ subptr(rsp, rbx); 2737 2738 // Push interpreter frames in a loop 2739 Label loop; 2740 __ bind(loop); 2741 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2742 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2743 __ pushptr(Address(rcx, 0)); // Save return address 2744 __ enter(); // Save old & set new ebp 2745 __ subptr(rsp, rbx); // Prolog 2746 // This value is corrected by layout_activation_impl 2747 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2748 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2749 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2750 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2751 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2752 __ decrementl(rdx); // Decrement counter 2753 __ jcc(Assembler::notZero, loop); 2754 __ pushptr(Address(rcx, 0)); // Save final return address 2755 2756 // Re-push self-frame 2757 __ enter(); // Save old & set new ebp 2758 2759 // Allocate a full sized register save area. 2760 // Return address and rbp are in place, so we allocate two less words. 2761 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2762 2763 // Restore frame locals after moving the frame 2764 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2765 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2766 2767 // Call C code. Need thread but NOT official VM entry 2768 // crud. We cannot block on this call, no GC can happen. Call should 2769 // restore return values to their stack-slots with the new SP. 2770 // 2771 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2772 2773 // Use rbp because the frames look interpreted now 2774 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2775 // Don't need the precise return PC here, just precise enough to point into this code blob. 2776 address the_pc = __ pc(); 2777 __ set_last_Java_frame(noreg, rbp, the_pc); 2778 2779 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2780 __ mov(c_rarg0, r15_thread); 2781 __ movl(c_rarg1, r14); // second arg: exec_mode 2782 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2783 // Revert SP alignment after call since we're going to do some SP relative addressing below 2784 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2785 2786 // Set an oopmap for the call site 2787 // Use the same PC we used for the last java frame 2788 oop_maps->add_gc_map(the_pc - start, 2789 new OopMap( frame_size_in_words, 0 )); 2790 2791 // Clear fp AND pc 2792 __ reset_last_Java_frame(true); 2793 2794 // Collect return values 2795 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2796 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2797 // I think this is useless (throwing pc?) 2798 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2799 2800 // Pop self-frame. 
2801 __ leave(); // Epilog 2802 2803 // Jump to interpreter 2804 __ ret(0); 2805 2806 // Make sure all code is generated 2807 masm->flush(); 2808 2809 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2810 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2811 #if INCLUDE_JVMCI 2812 if (EnableJVMCI) { 2813 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2814 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2815 } 2816 #endif 2817 } 2818 2819 #ifdef COMPILER2 2820 //------------------------------generate_uncommon_trap_blob-------------------- 2821 void SharedRuntime::generate_uncommon_trap_blob() { 2822 // Allocate space for the code 2823 ResourceMark rm; 2824 // Setup code generation tools 2825 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2826 MacroAssembler* masm = new MacroAssembler(&buffer); 2827 2828 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2829 2830 address start = __ pc(); 2831 2832 if (UseRTMLocking) { 2833 // Abort RTM transaction before possible nmethod deoptimization. 2834 __ xabort(0); 2835 } 2836 2837 // Push self-frame. We get here with a return address on the 2838 // stack, so rsp is 8-byte aligned until we allocate our frame. 2839 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2840 2841 // No callee saved registers. rbp is assumed implicitly saved 2842 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2843 2844 // compiler left unloaded_class_index in j_rarg0 move to where the 2845 // runtime expects it. 2846 __ movl(c_rarg1, j_rarg0); 2847 2848 __ set_last_Java_frame(noreg, noreg, NULL); 2849 2850 // Call C code. Need thread but NOT official VM entry 2851 // crud. We cannot block on this call, no GC can happen. Call should 2852 // capture callee-saved registers as well as return values. 2853 // Thread is in rdi already. 2854 // 2855 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2856 2857 __ mov(c_rarg0, r15_thread); 2858 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2859 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2860 2861 // Set an oopmap for the call site 2862 OopMapSet* oop_maps = new OopMapSet(); 2863 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2864 2865 // location of rbp is known implicitly by the frame sender code 2866 2867 oop_maps->add_gc_map(__ pc() - start, map); 2868 2869 __ reset_last_Java_frame(false); 2870 2871 // Load UnrollBlock* into rdi 2872 __ mov(rdi, rax); 2873 2874 #ifdef ASSERT 2875 { Label L; 2876 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2877 (int32_t)Deoptimization::Unpack_uncommon_trap); 2878 __ jcc(Assembler::equal, L); 2879 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2880 __ bind(L); 2881 } 2882 #endif 2883 2884 // Pop all the frames we must move/replace. 2885 // 2886 // Frame picture (youngest to oldest) 2887 // 1: self-frame (no frame link) 2888 // 2: deopting frame (no frame link) 2889 // 3: caller of deopting frame (could be compiled/interpreted). 2890 2891 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2892 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
2893 2894 // Pop deoptimized frame (int) 2895 __ movl(rcx, Address(rdi, 2896 Deoptimization::UnrollBlock:: 2897 size_of_deoptimized_frame_offset_in_bytes())); 2898 __ addptr(rsp, rcx); 2899 2900 // rsp should be pointing at the return address to the caller (3) 2901 2902 // Pick up the initial fp we should save 2903 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2904 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2905 2906 #ifdef ASSERT 2907 // Compilers generate code that bang the stack by as much as the 2908 // interpreter would need. So this stack banging should never 2909 // trigger a fault. Verify that it does not on non product builds. 2910 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2911 __ bang_stack_size(rbx, rcx); 2912 #endif 2913 2914 // Load address of array of frame pcs into rcx (address*) 2915 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2916 2917 // Trash the return pc 2918 __ addptr(rsp, wordSize); 2919 2920 // Load address of array of frame sizes into rsi (intptr_t*) 2921 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2922 2923 // Counter 2924 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2925 2926 // Now adjust the caller's stack to make up for the extra locals but 2927 // record the original sp so that we can save it in the skeletal 2928 // interpreter frame and the stack walking of interpreter_sender 2929 // will get the unextended sp value and not the "real" sp value. 2930 2931 const Register sender_sp = r8; 2932 2933 __ mov(sender_sp, rsp); 2934 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2935 __ subptr(rsp, rbx); 2936 2937 // Push interpreter frames in a loop 2938 Label loop; 2939 __ bind(loop); 2940 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2941 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2942 __ pushptr(Address(rcx, 0)); // Save return address 2943 __ enter(); // Save old & set new rbp 2944 __ subptr(rsp, rbx); // Prolog 2945 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2946 sender_sp); // Make it walkable 2947 // This value is corrected by layout_activation_impl 2948 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2949 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2950 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2951 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2952 __ decrementl(rdx); // Decrement counter 2953 __ jcc(Assembler::notZero, loop); 2954 __ pushptr(Address(rcx, 0)); // Save final return address 2955 2956 // Re-push self-frame 2957 __ enter(); // Save old & set new rbp 2958 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2959 // Prolog 2960 2961 // Use rbp because the frames look interpreted now 2962 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2963 // Don't need the precise return PC here, just precise enough to point into this code blob. 2964 address the_pc = __ pc(); 2965 __ set_last_Java_frame(noreg, rbp, the_pc); 2966 2967 // Call C code. Need thread but NOT official VM entry 2968 // crud. We cannot block on this call, no GC can happen. 
Call should 2969 // restore return values to their stack-slots with the new SP. 2970 // Thread is in rdi already. 2971 // 2972 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 2973 2974 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 2975 __ mov(c_rarg0, r15_thread); 2976 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 2977 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2978 2979 // Set an oopmap for the call site 2980 // Use the same PC we used for the last java frame 2981 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 2982 2983 // Clear fp AND pc 2984 __ reset_last_Java_frame(true); 2985 2986 // Pop self-frame. 2987 __ leave(); // Epilog 2988 2989 // Jump to interpreter 2990 __ ret(0); 2991 2992 // Make sure all code is generated 2993 masm->flush(); 2994 2995 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 2996 SimpleRuntimeFrame::framesize >> 1); 2997 } 2998 #endif // COMPILER2 2999 3000 //------------------------------generate_handler_blob------ 3001 // 3002 // Generate a special Compile2Runtime blob that saves all registers, 3003 // and setup oopmap. 3004 // 3005 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3006 assert(StubRoutines::forward_exception_entry() != NULL, 3007 "must be generated before"); 3008 3009 ResourceMark rm; 3010 OopMapSet *oop_maps = new OopMapSet(); 3011 OopMap* map; 3012 3013 // Allocate space for the code. Setup code generation tools. 3014 CodeBuffer buffer("handler_blob", 2048, 1024); 3015 MacroAssembler* masm = new MacroAssembler(&buffer); 3016 3017 address start = __ pc(); 3018 address call_pc = NULL; 3019 int frame_size_in_words; 3020 bool cause_return = (poll_type == POLL_AT_RETURN); 3021 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3022 3023 if (UseRTMLocking) { 3024 // Abort RTM transaction before calling runtime 3025 // because critical section will be large and will be 3026 // aborted anyway. Also nmethod could be deoptimized. 3027 __ xabort(0); 3028 } 3029 3030 // Make room for return address (or push it again) 3031 if (!cause_return) { 3032 __ push(rbx); 3033 } 3034 3035 // Save registers, fpu state, and flags 3036 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3037 3038 // The following is basically a call_VM. However, we need the precise 3039 // address of the call in order to generate an oopmap. Hence, we do all the 3040 // work outselves. 3041 3042 __ set_last_Java_frame(noreg, noreg, NULL); 3043 3044 // The return address must always be correct so that frame constructor never 3045 // sees an invalid pc. 3046 3047 if (!cause_return) { 3048 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3049 // Additionally, rbx is a callee saved register and we can look at it later to determine 3050 // if someone changed the return address for us! 3051 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3052 __ movptr(Address(rbp, wordSize), rbx); 3053 } 3054 3055 // Do the call 3056 __ mov(c_rarg0, r15_thread); 3057 __ call(RuntimeAddress(call_ptr)); 3058 3059 // Set an oopmap for the call site. This oopmap will map all 3060 // oop-registers and debug-info registers as callee-saved. This 3061 // will allow deoptimization at this safepoint to find all possible 3062 // debug-info recordings, as well as let GC find all oops. 
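// (The gc map registered below is keyed by the code offset of the instruction
// immediately after the call, which is what __ pc() - start yields at this
// point.)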
3063 3064 oop_maps->add_gc_map( __ pc() - start, map); 3065 3066 Label noException; 3067 3068 __ reset_last_Java_frame(false); 3069 3070 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 3071 __ jcc(Assembler::equal, noException); 3072 3073 // Exception pending 3074 3075 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3076 3077 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3078 3079 // No exception case 3080 __ bind(noException); 3081 3082 Label no_adjust; 3083 #ifdef ASSERT 3084 Label bail; 3085 #endif 3086 if (!cause_return) { 3087 Label no_prefix, not_special; 3088 3089 // If our stashed return pc was modified by the runtime we avoid touching it 3090 __ cmpptr(rbx, Address(rbp, wordSize)); 3091 __ jccb(Assembler::notEqual, no_adjust); 3092 3093 // Skip over the poll instruction. 3094 // See NativeInstruction::is_safepoint_poll() 3095 // Possible encodings: 3096 // 85 00 test %eax,(%rax) 3097 // 85 01 test %eax,(%rcx) 3098 // 85 02 test %eax,(%rdx) 3099 // 85 03 test %eax,(%rbx) 3100 // 85 06 test %eax,(%rsi) 3101 // 85 07 test %eax,(%rdi) 3102 // 3103 // 41 85 00 test %eax,(%r8) 3104 // 41 85 01 test %eax,(%r9) 3105 // 41 85 02 test %eax,(%r10) 3106 // 41 85 03 test %eax,(%r11) 3107 // 41 85 06 test %eax,(%r14) 3108 // 41 85 07 test %eax,(%r15) 3109 // 3110 // 85 04 24 test %eax,(%rsp) 3111 // 41 85 04 24 test %eax,(%r12) 3112 // 85 45 00 test %eax,0x0(%rbp) 3113 // 41 85 45 00 test %eax,0x0(%r13) 3114 3115 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3116 __ jcc(Assembler::notEqual, no_prefix); 3117 __ addptr(rbx, 1); 3118 __ bind(no_prefix); 3119 #ifdef ASSERT 3120 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3121 #endif 3122 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3123 // r12/rsp 0x04 3124 // r13/rbp 0x05 3125 __ movzbq(rcx, Address(rbx, 1)); 3126 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3127 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3128 __ cmpptr(rcx, 1); 3129 __ jcc(Assembler::above, not_special); 3130 __ addptr(rbx, 1); 3131 __ bind(not_special); 3132 #ifdef ASSERT 3133 // Verify the correct encoding of the poll we're about to skip. 3134 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3135 __ jcc(Assembler::notEqual, bail); 3136 // Mask out the modrm bits 3137 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3138 // rax encodes to 0, so if the bits are nonzero it's incorrect 3139 __ jcc(Assembler::notZero, bail); 3140 #endif 3141 // Adjust return pc forward to step over the safepoint poll instruction 3142 __ addptr(rbx, 2); 3143 __ movptr(Address(rbp, wordSize), rbx); 3144 } 3145 3146 __ bind(no_adjust); 3147 // Normal exit, restore registers and exit. 3148 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3149 __ ret(0); 3150 3151 #ifdef ASSERT 3152 __ bind(bail); 3153 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3154 #endif 3155 3156 // Make sure all code is generated 3157 masm->flush(); 3158 3159 // Fill-out other meta info 3160 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3161 } 3162 3163 // 3164 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3165 // 3166 // Generate a stub that calls into vm to find out the proper destination 3167 // of a java call. 
// All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1000, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = NULL;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, NULL);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
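  // Illustration of how this stub is typically instantiated (a hedged sketch;
  // the actual call sites live in the shared SharedRuntime::generate_stubs(),
  // not in this file, and the stub name strings shown are approximate):
  //
  //   _wrong_method_blob             = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::handle_wrong_method),         "wrong_method_stub");
  //   _ic_miss_blob                  = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::handle_wrong_method_ic_miss), "ic_miss_stub");
  //   _resolve_opt_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_opt_virtual_call_C),  "resolve_opt_virtual_call");
  //   _resolve_virtual_call_blob     = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C),      "resolve_virtual_call");
  //   _resolve_static_call_blob      = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),       "resolve_static_call");
  //
  // Each destination returns the resolved entry point in rax, which is why
  // the code above stores rax (and the Method* fetched into rbx) back into
  // the saved register area before restoring registers and jumping to rax.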
3245 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3246 } 3247 3248 #ifdef COMPILER2 3249 static const int native_invoker_code_size = MethodHandles::adapter_code_size; 3250 3251 class NativeInvokerGenerator : public StubCodeGenerator { 3252 address _call_target; 3253 int _shadow_space_bytes; 3254 3255 const GrowableArray<VMReg>& _input_registers; 3256 const GrowableArray<VMReg>& _output_registers; 3257 3258 int _frame_complete; 3259 int _framesize; 3260 OopMapSet* _oop_maps; 3261 public: 3262 NativeInvokerGenerator(CodeBuffer* buffer, 3263 address call_target, 3264 int shadow_space_bytes, 3265 const GrowableArray<VMReg>& input_registers, 3266 const GrowableArray<VMReg>& output_registers) 3267 : StubCodeGenerator(buffer, PrintMethodHandleStubs), 3268 _call_target(call_target), 3269 _shadow_space_bytes(shadow_space_bytes), 3270 _input_registers(input_registers), 3271 _output_registers(output_registers), 3272 _frame_complete(0), 3273 _framesize(0), 3274 _oop_maps(NULL) { 3275 assert(_output_registers.length() <= 1 3276 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns"); 3277 3278 } 3279 3280 void generate(); 3281 3282 int spill_size_in_bytes() const { 3283 if (_output_registers.length() == 0) { 3284 return 0; 3285 } 3286 VMReg reg = _output_registers.at(0); 3287 assert(reg->is_reg(), "must be a register"); 3288 if (reg->is_Register()) { 3289 return 8; 3290 } else if (reg->is_XMMRegister()) { 3291 if (UseAVX >= 3) { 3292 return 64; 3293 } else if (UseAVX >= 1) { 3294 return 32; 3295 } else { 3296 return 16; 3297 } 3298 } else { 3299 ShouldNotReachHere(); 3300 } 3301 return 0; 3302 } 3303 3304 void spill_out_registers() { 3305 if (_output_registers.length() == 0) { 3306 return; 3307 } 3308 VMReg reg = _output_registers.at(0); 3309 assert(reg->is_reg(), "must be a register"); 3310 MacroAssembler* masm = _masm; 3311 if (reg->is_Register()) { 3312 __ movptr(Address(rsp, 0), reg->as_Register()); 3313 } else if (reg->is_XMMRegister()) { 3314 if (UseAVX >= 3) { 3315 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit); 3316 } else if (UseAVX >= 1) { 3317 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister()); 3318 } else { 3319 __ movdqu(Address(rsp, 0), reg->as_XMMRegister()); 3320 } 3321 } else { 3322 ShouldNotReachHere(); 3323 } 3324 } 3325 3326 void fill_out_registers() { 3327 if (_output_registers.length() == 0) { 3328 return; 3329 } 3330 VMReg reg = _output_registers.at(0); 3331 assert(reg->is_reg(), "must be a register"); 3332 MacroAssembler* masm = _masm; 3333 if (reg->is_Register()) { 3334 __ movptr(reg->as_Register(), Address(rsp, 0)); 3335 } else if (reg->is_XMMRegister()) { 3336 if (UseAVX >= 3) { 3337 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit); 3338 } else if (UseAVX >= 1) { 3339 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3340 } else { 3341 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3342 } 3343 } else { 3344 ShouldNotReachHere(); 3345 } 3346 } 3347 3348 int frame_complete() const { 3349 return _frame_complete; 3350 } 3351 3352 int framesize() const { 3353 return (_framesize >> (LogBytesPerWord - LogBytesPerInt)); 3354 } 3355 3356 OopMapSet* oop_maps() const { 3357 return _oop_maps; 3358 } 3359 3360 private: 3361 #ifdef ASSERT 3362 bool target_uses_register(VMReg reg) { 3363 return _input_registers.contains(reg) || _output_registers.contains(reg); 3364 } 3365 #endif 3366 }; 3367 3368 RuntimeStub* 
SharedRuntime::make_native_invoker(address call_target, 3369 int shadow_space_bytes, 3370 const GrowableArray<VMReg>& input_registers, 3371 const GrowableArray<VMReg>& output_registers) { 3372 int locs_size = 64; 3373 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size); 3374 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers); 3375 g.generate(); 3376 code.log_section_sizes("nep_invoker_blob"); 3377 3378 RuntimeStub* stub = 3379 RuntimeStub::new_runtime_stub("nep_invoker_blob", 3380 &code, 3381 g.frame_complete(), 3382 g.framesize(), 3383 g.oop_maps(), false); 3384 return stub; 3385 } 3386 3387 void NativeInvokerGenerator::generate() { 3388 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict"); 3389 3390 enum layout { 3391 rbp_off, 3392 rbp_off2, 3393 return_off, 3394 return_off2, 3395 framesize // inclusive of return address 3396 }; 3397 3398 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4); 3399 assert(is_even(_framesize/2), "sp not 16-byte aligned"); 3400 3401 _oop_maps = new OopMapSet(); 3402 MacroAssembler* masm = _masm; 3403 3404 address start = __ pc(); 3405 3406 __ enter(); 3407 3408 // return address and rbp are already in place 3409 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog 3410 3411 _frame_complete = __ pc() - start; 3412 3413 address the_pc = __ pc(); 3414 3415 __ set_last_Java_frame(rsp, rbp, (address)the_pc); 3416 OopMap* map = new OopMap(_framesize, 0); 3417 _oop_maps->add_gc_map(the_pc - start, map); 3418 3419 // State transition 3420 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 3421 3422 __ call(RuntimeAddress(_call_target)); 3423 3424 __ restore_cpu_control_state_after_jni(); 3425 3426 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 3427 3428 // Force this write out before the read below 3429 __ membar(Assembler::Membar_mask_bits( 3430 Assembler::LoadLoad | Assembler::LoadStore | 3431 Assembler::StoreLoad | Assembler::StoreStore)); 3432 3433 Label L_after_safepoint_poll; 3434 Label L_safepoint_poll_slow_path; 3435 3436 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 3437 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 3438 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path); 3439 3440 __ bind(L_after_safepoint_poll); 3441 3442 // change thread state 3443 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 3444 3445 __ block_comment("reguard stack check"); 3446 Label L_reguard; 3447 Label L_after_reguard; 3448 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 3449 __ jcc(Assembler::equal, L_reguard); 3450 __ bind(L_after_reguard); 3451 3452 __ reset_last_Java_frame(r15_thread, true); 3453 3454 __ leave(); // required for proper stackwalking of RuntimeStub frame 3455 __ ret(0); 3456 3457 ////////////////////////////////////////////////////////////////////////////// 3458 3459 __ block_comment("{ L_safepoint_poll_slow_path"); 3460 __ bind(L_safepoint_poll_slow_path); 3461 __ vzeroupper(); 3462 3463 spill_out_registers(); 3464 3465 __ mov(c_rarg0, r15_thread); 3466 __ mov(r12, rsp); // remember sp 3467 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3468 __ andptr(rsp, -16); // align stack as 
required by ABI 3469 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 3470 __ mov(rsp, r12); // restore sp 3471 __ reinit_heapbase(); 3472 3473 fill_out_registers(); 3474 3475 __ jmp(L_after_safepoint_poll); 3476 __ block_comment("} L_safepoint_poll_slow_path"); 3477 3478 ////////////////////////////////////////////////////////////////////////////// 3479 3480 __ block_comment("{ L_reguard"); 3481 __ bind(L_reguard); 3482 __ vzeroupper(); 3483 3484 spill_out_registers(); 3485 3486 __ mov(r12, rsp); // remember sp 3487 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3488 __ andptr(rsp, -16); // align stack as required by ABI 3489 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 3490 __ mov(rsp, r12); // restore sp 3491 __ reinit_heapbase(); 3492 3493 fill_out_registers(); 3494 3495 __ jmp(L_after_reguard); 3496 3497 __ block_comment("} L_reguard"); 3498 3499 ////////////////////////////////////////////////////////////////////////////// 3500 3501 __ flush(); 3502 } 3503 #endif // COMPILER2 3504 3505 //------------------------------Montgomery multiplication------------------------ 3506 // 3507 3508 #ifndef _WINDOWS 3509 3510 // Subtract 0:b from carry:a. Return carry. 3511 static julong 3512 sub(julong a[], julong b[], julong carry, long len) { 3513 long long i = 0, cnt = len; 3514 julong tmp; 3515 asm volatile("clc; " 3516 "0: ; " 3517 "mov (%[b], %[i], 8), %[tmp]; " 3518 "sbb %[tmp], (%[a], %[i], 8); " 3519 "inc %[i]; dec %[cnt]; " 3520 "jne 0b; " 3521 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3522 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3523 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3524 : "memory"); 3525 return tmp; 3526 } 3527 3528 // Multiply (unsigned) Long A by Long B, accumulating the double- 3529 // length result into the accumulator formed of T0, T1, and T2. 3530 #define MACC(A, B, T0, T1, T2) \ 3531 do { \ 3532 unsigned long hi, lo; \ 3533 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3534 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3535 : "r"(A), "a"(B) : "cc"); \ 3536 } while(0) 3537 3538 // As above, but add twice the double-length result into the 3539 // accumulator. 3540 #define MACC2(A, B, T0, T1, T2) \ 3541 do { \ 3542 unsigned long hi, lo; \ 3543 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3544 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3545 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3546 : "r"(A), "a"(B) : "cc"); \ 3547 } while(0) 3548 3549 #else //_WINDOWS 3550 3551 static julong 3552 sub(julong a[], julong b[], julong carry, long len) { 3553 long i; 3554 julong tmp; 3555 unsigned char c = 1; 3556 for (i = 0; i < len; i++) { 3557 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3558 a[i] = tmp; 3559 } 3560 c = _addcarry_u64(c, carry, ~0, &tmp); 3561 return tmp; 3562 } 3563 3564 // Multiply (unsigned) Long A by Long B, accumulating the double- 3565 // length result into the accumulator formed of T0, T1, and T2. 3566 #define MACC(A, B, T0, T1, T2) \ 3567 do { \ 3568 julong hi, lo; \ 3569 lo = _umul128(A, B, &hi); \ 3570 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3571 c = _addcarry_u64(c, hi, T1, &T1); \ 3572 _addcarry_u64(c, T2, 0, &T2); \ 3573 } while(0) 3574 3575 // As above, but add twice the double-length result into the 3576 // accumulator. 
3577 #define MACC2(A, B, T0, T1, T2) \ 3578 do { \ 3579 julong hi, lo; \ 3580 lo = _umul128(A, B, &hi); \ 3581 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3582 c = _addcarry_u64(c, hi, T1, &T1); \ 3583 _addcarry_u64(c, T2, 0, &T2); \ 3584 c = _addcarry_u64(0, lo, T0, &T0); \ 3585 c = _addcarry_u64(c, hi, T1, &T1); \ 3586 _addcarry_u64(c, T2, 0, &T2); \ 3587 } while(0) 3588 3589 #endif //_WINDOWS 3590 3591 // Fast Montgomery multiplication. The derivation of the algorithm is 3592 // in A Cryptographic Library for the Motorola DSP56000, 3593 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3594 3595 static void NOINLINE 3596 montgomery_multiply(julong a[], julong b[], julong n[], 3597 julong m[], julong inv, int len) { 3598 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3599 int i; 3600 3601 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3602 3603 for (i = 0; i < len; i++) { 3604 int j; 3605 for (j = 0; j < i; j++) { 3606 MACC(a[j], b[i-j], t0, t1, t2); 3607 MACC(m[j], n[i-j], t0, t1, t2); 3608 } 3609 MACC(a[i], b[0], t0, t1, t2); 3610 m[i] = t0 * inv; 3611 MACC(m[i], n[0], t0, t1, t2); 3612 3613 assert(t0 == 0, "broken Montgomery multiply"); 3614 3615 t0 = t1; t1 = t2; t2 = 0; 3616 } 3617 3618 for (i = len; i < 2*len; i++) { 3619 int j; 3620 for (j = i-len+1; j < len; j++) { 3621 MACC(a[j], b[i-j], t0, t1, t2); 3622 MACC(m[j], n[i-j], t0, t1, t2); 3623 } 3624 m[i-len] = t0; 3625 t0 = t1; t1 = t2; t2 = 0; 3626 } 3627 3628 while (t0) 3629 t0 = sub(m, n, t0, len); 3630 } 3631 3632 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3633 // multiplies so it should be up to 25% faster than Montgomery 3634 // multiplication. However, its loop control is more complex and it 3635 // may actually run slower on some machines. 3636 3637 static void NOINLINE 3638 montgomery_square(julong a[], julong n[], 3639 julong m[], julong inv, int len) { 3640 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3641 int i; 3642 3643 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3644 3645 for (i = 0; i < len; i++) { 3646 int j; 3647 int end = (i+1)/2; 3648 for (j = 0; j < end; j++) { 3649 MACC2(a[j], a[i-j], t0, t1, t2); 3650 MACC(m[j], n[i-j], t0, t1, t2); 3651 } 3652 if ((i & 1) == 0) { 3653 MACC(a[j], a[j], t0, t1, t2); 3654 } 3655 for (; j < i; j++) { 3656 MACC(m[j], n[i-j], t0, t1, t2); 3657 } 3658 m[i] = t0 * inv; 3659 MACC(m[i], n[0], t0, t1, t2); 3660 3661 assert(t0 == 0, "broken Montgomery square"); 3662 3663 t0 = t1; t1 = t2; t2 = 0; 3664 } 3665 3666 for (i = len; i < 2*len; i++) { 3667 int start = i-len+1; 3668 int end = start + (len - start)/2; 3669 int j; 3670 for (j = start; j < end; j++) { 3671 MACC2(a[j], a[i-j], t0, t1, t2); 3672 MACC(m[j], n[i-j], t0, t1, t2); 3673 } 3674 if ((i & 1) == 0) { 3675 MACC(a[j], a[j], t0, t1, t2); 3676 } 3677 for (; j < len; j++) { 3678 MACC(m[j], n[i-j], t0, t1, t2); 3679 } 3680 m[i-len] = t0; 3681 t0 = t1; t1 = t2; t2 = 0; 3682 } 3683 3684 while (t0) 3685 t0 = sub(m, n, t0, len); 3686 } 3687 3688 // Swap words in a longword. 3689 static julong swap(julong x) { 3690 return (x << 32) | (x >> 32); 3691 } 3692 3693 // Copy len longwords from s to d, word-swapping as we go. The 3694 // destination array is reversed. 
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8k bytes of stack space here.
  int total_allocation = longwords * sizeof (julong) * 4;
  guarantee(total_allocation <= 8192, "must be");
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6k bytes of stack space here.
  int total_allocation = longwords * sizeof (julong) * 3;
  guarantee(total_allocation <= 8192, "must be");
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Compiled methods jump into this blob via the exception handler emitted by
// emit_exception_handler in the x86_64.ad file.
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.
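  //
  // From here on the blob unwinds its own frame and hands control to the
  // handler found by handle_exception_C: rbp and the frame are restored, the
  // exception oop/pc cached in the JavaThread are reloaded into rax/rdx while
  // the handler address is parked in r8, the thread-local exception oop is
  // cleared so GC no longer treats it as a root, and we jump to r8.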
3865 3866 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3867 3868 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3869 __ pop(rdx); // No need for exception pc anymore 3870 3871 // rax: exception handler 3872 3873 // We have a handler in rax (could be deopt blob). 3874 __ mov(r8, rax); 3875 3876 // Get the exception oop 3877 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3878 // Get the exception pc in case we are deoptimized 3879 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3880 #ifdef ASSERT 3881 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD); 3882 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD); 3883 #endif 3884 // Clear the exception oop so GC no longer processes it as a root. 3885 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD); 3886 3887 // rax: exception oop 3888 // r8: exception handler 3889 // rdx: exception pc 3890 // Jump to handler 3891 3892 __ jmp(r8); 3893 3894 // Make sure all code is generated 3895 masm->flush(); 3896 3897 // Set exception blob 3898 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3899 } 3900 #endif // COMPILER2 3901 3902 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt, 3903 int total_in_args, const VMRegPair* in_regs, 3904 int total_out_args, VMRegPair* out_regs, 3905 GrowableArray<int>& arg_order, 3906 VMRegPair tmp_vmreg) { 3907 ComputeMoveOrder order(total_in_args, in_regs, 3908 total_out_args, out_regs, 3909 in_sig_bt, arg_order, tmp_vmreg); 3910 }
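
// Note on the Montgomery helpers above: the static montgomery_multiply()
// computes m = a * b * R^-1 (mod n) and montgomery_square() computes
// m = a^2 * R^-1 (mod n), where R = 2^(64*len) for the julong word count len
// and inv satisfies inv * n[0] == -1 (mod 2^64). The MACC/MACC2 macros add a
// 128-bit product into the 192-bit accumulator (t2:t1:t0). A portable sketch
// of what MACC does, assuming a compiler with unsigned __int128 support (the
// helper name macc_ref is illustrative only and not part of HotSpot):
//
//   static inline void macc_ref(julong a, julong b,
//                               julong& t0, julong& t1, julong& t2) {
//     unsigned __int128 p = (unsigned __int128)a * b;       // full 128-bit product
//     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;
//     t0 = (julong)s;                                       // add low word, keep carry
//     s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64);
//     t1 = (julong)s;                                       // add high word plus carry
//     t2 += (julong)(s >> 64);                              // propagate final carry
//   }
//
// MACC2 is the same accumulation applied twice, which is what lets
// montgomery_square() fold the symmetric a[j]*a[i-j] terms and perform
// asymptotically 25% fewer multiplies than the general multiply.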