/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
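  // For orientation, the fixed offsets defined below follow the processor's
  // fxsave/xsave image layout (a rough sketch, not normative): the legacy
  // fxsave area keeps xmm0..xmm15 starting at byte 160, the upper halves of
  // ymm0..ymm15 start at byte 576, the opmask registers k0..k7 at byte 1088,
  // the upper halves of zmm0..zmm15 at byte 1152, and the full zmm16..zmm31
  // ("upper bank") at byte 1664; the code below reads and writes those regions
  // relative to the start of the save image.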
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,     // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.
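  //
  // Roughly, the frame built by the sequence below looks like this
  // (higher addresses at the top, matching the enum layout above):
  //
  //   [ return address        ]  <- pushed by the caller
  //   [ saved rbp             ]  <- enter()
  //   [ flags, alignment word ]  \
  //   [ rax .. r15            ]   > push_CPU_state()
  //   [ fxsave/xsave image    ]  /
  //   [ arg_reg_save_area     ]  <- allocated below, if any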

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ),
                          r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?
                       Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
// up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
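  //
  // As a worked example (hypothetical signature, for illustration only):
  // for (int, long, Object, float, double) the incoming sig_bt is
  //   { T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT, T_DOUBLE, T_VOID }
  // and the loop below assigns
  //   T_INT    -> j_rarg0        T_FLOAT  -> j_farg0
  //   T_LONG   -> j_rarg1        T_DOUBLE -> j_farg1
  //   T_OBJECT -> j_rarg2
  // with each T_VOID half marked bad. Only once the six j_rarg or eight
  // j_farg registers are exhausted do arguments spill to stack slots
  // (two 4-byte slots per argument, tracked by stk_args).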
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.  Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //  i   st_off
    //  0   32 T_LONG
    //  1   24 T_VOID
    //  2   16 T_OBJECT
    //  3    8 T_BOOL
    //  -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double
    // in a single slot on a 64-bit VM and it would be silly to break them up,
    // the interpreter leaves one slot empty and only stores to a single slot.
    // In this case the slot that is occupied is the T_VOID slot. See, I said
    // it was confusing.
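    //
    // For instance, for the T_LONG at i == 0 above, st_off is 32 and next_off
    // is 24: the 64-bit value is written to next_off (the slot tagged T_VOID),
    // while the slot at st_off is only filled with known junk in debug builds
    // (see the ASSERT blocks below).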

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less ) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
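    // For example (illustrative arithmetic only): comp_args_on_stack == 5
    // four-byte slots gives align_up(20, wordSize) >> LogBytesPerWord == 3
    // words, which is then rounded up to 4 words (align_up(3, 2)) to keep
    // the 16-byte alignment.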
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address and misalign the stack so that the youngest frame
  // sees it just as the placement of a call instruction would leave it
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it, should we end up there;
  // this is only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
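  //
  // For illustration (hypothetical JNI-style C signature, not from this file):
  // for (JNIEnv*, jobject, jint, jdouble), i.e.
  //   { T_ADDRESS, T_OBJECT, T_INT, T_DOUBLE, T_VOID },
  // the loop below assigns c_rarg0, c_rarg1, c_rarg2 to the first three
  // arguments on all platforms; the double lands in c_farg0 on the SysV ABI
  // but in c_farg3 on Win64, because Windows consumes one combined position
  // per argument regardless of kind (hence the paired int_args++/fp_args++
  // bumps below). On Win64, stk_args also grows by 2 per register argument
  // and is clamped to at least 8 at the end of this function, reserving the
  // mandatory 32-byte home area.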

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
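// For example, a critical native receiving a Java int[] gets it expanded by
// the wrapper into a (jint length, jint* body) pair before this helper runs;
// when the incoming array oop is NULL, both halves are passed as zero.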
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
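//
// As a small example of the cycle breaking done below (illustrative register
// choices only): if the required moves are rdi -> rsi and rsi -> rdi, the two
// stores form a cycle. The chain is broken by redirecting one store into the
// temporary register and appending a final store from the temp, so the emitted
// order becomes rdi -> temp, rsi -> rdi, temp -> rsi.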
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair _src;
    VMRegPair _dst;
    int _src_index;
    int _dst_index;
    bool _processed;
    MoveOperation* _next;
    MoveOperation* _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522 
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527 
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, because it's impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579 
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583 
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them.
To figure out where they go 1587 // we convert the java signature to a C signature by inserting 1588 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1589 1590 const int total_in_args = method->size_of_parameters(); 1591 int total_c_args = total_in_args; 1592 if (!is_critical_native) { 1593 total_c_args += 1; 1594 if (method->is_static()) { 1595 total_c_args++; 1596 } 1597 } else { 1598 for (int i = 0; i < total_in_args; i++) { 1599 if (in_sig_bt[i] == T_ARRAY) { 1600 total_c_args++; 1601 } 1602 } 1603 } 1604 1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1607 BasicType* in_elem_bt = NULL; 1608 1609 int argc = 0; 1610 if (!is_critical_native) { 1611 out_sig_bt[argc++] = T_ADDRESS; 1612 if (method->is_static()) { 1613 out_sig_bt[argc++] = T_OBJECT; 1614 } 1615 1616 for (int i = 0; i < total_in_args ; i++ ) { 1617 out_sig_bt[argc++] = in_sig_bt[i]; 1618 } 1619 } else { 1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); 1621 SignatureStream ss(method->signature()); 1622 for (int i = 0; i < total_in_args ; i++ ) { 1623 if (in_sig_bt[i] == T_ARRAY) { 1624 // Arrays are passed as int, elem* pair 1625 out_sig_bt[argc++] = T_INT; 1626 out_sig_bt[argc++] = T_ADDRESS; 1627 ss.skip_array_prefix(1); // skip one '[' 1628 assert(ss.is_primitive(), "primitive type expected"); 1629 in_elem_bt[i] = ss.type(); 1630 } else { 1631 out_sig_bt[argc++] = in_sig_bt[i]; 1632 in_elem_bt[i] = T_VOID; 1633 } 1634 if (in_sig_bt[i] != T_VOID) { 1635 assert(in_sig_bt[i] == ss.type() || 1636 in_sig_bt[i] == T_ARRAY, "must match"); 1637 ss.next(); 1638 } 1639 } 1640 } 1641 1642 // Now figure out where the args must be stored and how much stack space 1643 // they require. 1644 int out_arg_slots; 1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1646 1647 // Compute framesize for the wrapper. We need to handlize all oops in 1648 // incoming registers 1649 1650 // Calculate the total number of stack slots we will need. 1651 1652 // First count the abi requirement plus all of the outgoing args 1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1654 1655 // Now the space for the inbound oop handle area 1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1657 if (is_critical_native) { 1658 // Critical natives may have to call out so they need a save area 1659 // for register arguments. 
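    // Count the save slots the register arguments need: ints, sub-ints and
    // floats take a single slot; longs, doubles and array pointers (LP64)
    // take two. The total replaces the default six-register save area
    // computed above.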
1660 int double_slots = 0; 1661 int single_slots = 0; 1662 for ( int i = 0; i < total_in_args; i++) { 1663 if (in_regs[i].first()->is_Register()) { 1664 const Register reg = in_regs[i].first()->as_Register(); 1665 switch (in_sig_bt[i]) { 1666 case T_BOOLEAN: 1667 case T_BYTE: 1668 case T_SHORT: 1669 case T_CHAR: 1670 case T_INT: single_slots++; break; 1671 case T_ARRAY: // specific to LP64 (7145024) 1672 case T_LONG: double_slots++; break; 1673 default: ShouldNotReachHere(); 1674 } 1675 } else if (in_regs[i].first()->is_XMMRegister()) { 1676 switch (in_sig_bt[i]) { 1677 case T_FLOAT: single_slots++; break; 1678 case T_DOUBLE: double_slots++; break; 1679 default: ShouldNotReachHere(); 1680 } 1681 } else if (in_regs[i].first()->is_FloatRegister()) { 1682 ShouldNotReachHere(); 1683 } 1684 } 1685 total_save_slots = double_slots * 2 + single_slots; 1686 // align the save area 1687 if (double_slots != 0) { 1688 stack_slots = align_up(stack_slots, 2); 1689 } 1690 } 1691 1692 int oop_handle_offset = stack_slots; 1693 stack_slots += total_save_slots; 1694 1695 // Now any space we need for handlizing a klass if static method 1696 1697 int klass_slot_offset = 0; 1698 int klass_offset = -1; 1699 int lock_slot_offset = 0; 1700 bool is_static = false; 1701 1702 if (method->is_static()) { 1703 klass_slot_offset = stack_slots; 1704 stack_slots += VMRegImpl::slots_per_word; 1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1706 is_static = true; 1707 } 1708 1709 // Plus a lock if needed 1710 1711 if (method->is_synchronized()) { 1712 lock_slot_offset = stack_slots; 1713 stack_slots += VMRegImpl::slots_per_word; 1714 } 1715 1716 // Now a place (+2) to save return values or temp during shuffling 1717 // + 4 for return address (which we own) and saved rbp 1718 stack_slots += 6; 1719 1720 // Ok The space we have allocated will look like: 1721 // 1722 // 1723 // FP-> | | 1724 // |---------------------| 1725 // | 2 slots for moves | 1726 // |---------------------| 1727 // | lock box (if sync) | 1728 // |---------------------| <- lock_slot_offset 1729 // | klass (if static) | 1730 // |---------------------| <- klass_slot_offset 1731 // | oopHandle area | 1732 // |---------------------| <- oop_handle_offset (6 java arg registers) 1733 // | outbound memory | 1734 // | based arguments | 1735 // | | 1736 // |---------------------| 1737 // | | 1738 // SP-> | out_preserved_slots | 1739 // 1740 // 1741 1742 1743 // Now compute actual number of stack words we need rounding to make 1744 // stack properly aligned. 1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1746 1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1748 1749 // First thing make an ic check to see if we should even be here 1750 1751 // We are free to use all registers as temps without saving them and 1752 // restoring them except rbp. rbp is the only callee save register 1753 // as far as the interpreter and the compiler(s) are concerned. 
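  // Inline cache check: the caller loads the expected receiver klass into rax
  // (ic_reg); compare it with the receiver's actual klass and jump to the
  // ic-miss stub on a mismatch, otherwise fall through to the verified entry.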
1754 1755 1756 const Register ic_reg = rax; 1757 const Register receiver = j_rarg0; 1758 1759 Label hit; 1760 Label exception_pending; 1761 1762 assert_different_registers(ic_reg, receiver, rscratch1); 1763 __ verify_oop(receiver); 1764 __ load_klass(rscratch1, receiver, rscratch2); 1765 __ cmpq(ic_reg, rscratch1); 1766 __ jcc(Assembler::equal, hit); 1767 1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1769 1770 // Verified entry point must be aligned 1771 __ align(8); 1772 1773 __ bind(hit); 1774 1775 int vep_offset = ((intptr_t)__ pc()) - start; 1776 1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1778 Label L_skip_barrier; 1779 Register klass = r10; 1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1782 1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1784 1785 __ bind(L_skip_barrier); 1786 } 1787 1788 #ifdef COMPILER1 1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1792 } 1793 #endif // COMPILER1 1794 1795 // The instruction at the verified entry point must be 5 bytes or longer 1796 // because it can be patched on the fly by make_non_entrant. The stack bang 1797 // instruction fits that requirement. 1798 1799 // Generate stack overflow check 1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1801 1802 // Generate a new frame for the wrapper. 1803 __ enter(); 1804 // -2 because return address is already present and so is saved rbp 1805 __ subptr(rsp, stack_size - 2*wordSize); 1806 1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->nmethod_entry_barrier(masm); 1809 1810 // Frame is now completed as far as size and linkage. 1811 int frame_complete = ((intptr_t)__ pc()) - start; 1812 1813 if (UseRTMLocking) { 1814 // Abort RTM transaction before calling JNI 1815 // because critical section will be large and will be 1816 // aborted anyway. Also nmethod could be deoptimized. 1817 __ xabort(0); 1818 } 1819 1820 #ifdef ASSERT 1821 { 1822 Label L; 1823 __ mov(rax, rsp); 1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI) 1825 __ cmpptr(rax, rsp); 1826 __ jcc(Assembler::equal, L); 1827 __ stop("improperly aligned stack"); 1828 __ bind(L); 1829 } 1830 #endif /* ASSERT */ 1831 1832 1833 // We use r14 as the oop handle for the receiver/klass 1834 // It is callee save so it survives the call to native 1835 1836 const Register oop_handle_reg = r14; 1837 1838 // 1839 // We immediately shuffle the arguments so that any vm call we have to 1840 // make from here on out (sync slow path, jvmti, etc.) we will have 1841 // captured the oops from our caller and have a valid oopMap for 1842 // them. 1843 1844 // ----------------- 1845 // The Grand Shuffle 1846 1847 // The Java calling convention is either equal (linux) or denser (win64) than the 1848 // c calling convention. However the because of the jni_env argument the c calling 1849 // convention always has at least one more (and two for static) arguments than Java. 
1850 // Therefore if we move the args from java -> c backwards then we will never have 1851 // a register->register conflict and we don't have to build a dependency graph 1852 // and figure out how to break any cycles. 1853 // 1854 1855 // Record esp-based slot for receiver on stack for non-static methods 1856 int receiver_offset = -1; 1857 1858 // This is a trick. We double the stack slots so we can claim 1859 // the oops in the caller's frame. Since we are sure to have 1860 // more args than the caller doubling is enough to make 1861 // sure we can capture all the incoming oop args from the 1862 // caller. 1863 // 1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1865 1866 // Mark location of rbp (someday) 1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1868 1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1870 // All inbound args are referenced based on rbp and all outbound args via rsp. 1871 1872 1873 #ifdef ASSERT 1874 bool reg_destroyed[RegisterImpl::number_of_registers]; 1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1877 reg_destroyed[r] = false; 1878 } 1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1880 freg_destroyed[f] = false; 1881 } 1882 1883 #endif /* ASSERT */ 1884 1885 // This may iterate in two different directions depending on the 1886 // kind of native it is. The reason is that for regular JNI natives 1887 // the incoming and outgoing registers are offset upwards and for 1888 // critical natives they are offset down. 1889 GrowableArray<int> arg_order(2 * total_in_args); 1890 1891 VMRegPair tmp_vmreg; 1892 tmp_vmreg.set2(rbx->as_VMReg()); 1893 1894 if (!is_critical_native) { 1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1896 arg_order.push(i); 1897 arg_order.push(c_arg); 1898 } 1899 } else { 1900 // Compute a valid move order, using tmp_vmreg to break any cycles 1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); 1902 } 1903 1904 int temploc = -1; 1905 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1906 int i = arg_order.at(ai); 1907 int c_arg = arg_order.at(ai + 1); 1908 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1909 if (c_arg == -1) { 1910 assert(is_critical_native, "should only be required for critical natives"); 1911 // This arg needs to be moved to a temporary 1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); 1913 in_regs[i] = tmp_vmreg; 1914 temploc = i; 1915 continue; 1916 } else if (i == -1) { 1917 assert(is_critical_native, "should only be required for critical natives"); 1918 // Read from the temporary location 1919 assert(temploc != -1, "must be valid"); 1920 i = temploc; 1921 temploc = -1; 1922 } 1923 #ifdef ASSERT 1924 if (in_regs[i].first()->is_Register()) { 1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1926 } else if (in_regs[i].first()->is_XMMRegister()) { 1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1928 } 1929 if (out_regs[c_arg].first()->is_Register()) { 1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1933 } 1934 #endif /* 
ASSERT */ 1935 switch (in_sig_bt[i]) { 1936 case T_ARRAY: 1937 if (is_critical_native) { 1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]); 1939 c_arg++; 1940 #ifdef ASSERT 1941 if (out_regs[c_arg].first()->is_Register()) { 1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1945 } 1946 #endif 1947 break; 1948 } 1949 case T_OBJECT: 1950 assert(!is_critical_native, "no oop arguments"); 1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 1952 ((i == 0) && (!is_static)), 1953 &receiver_offset); 1954 break; 1955 case T_VOID: 1956 break; 1957 1958 case T_FLOAT: 1959 __ float_move(in_regs[i], out_regs[c_arg]); 1960 break; 1961 1962 case T_DOUBLE: 1963 assert( i + 1 < total_in_args && 1964 in_sig_bt[i + 1] == T_VOID && 1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 1966 __ double_move(in_regs[i], out_regs[c_arg]); 1967 break; 1968 1969 case T_LONG : 1970 __ long_move(in_regs[i], out_regs[c_arg]); 1971 break; 1972 1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 1974 1975 default: 1976 __ move32_64(in_regs[i], out_regs[c_arg]); 1977 } 1978 } 1979 1980 int c_arg; 1981 1982 // Pre-load a static method's oop into r14. Used both by locking code and 1983 // the normal JNI call code. 1984 if (!is_critical_native) { 1985 // point c_arg at the first arg that is already loaded in case we 1986 // need to spill before we call out 1987 c_arg = total_c_args - total_in_args; 1988 1989 if (method->is_static()) { 1990 1991 // load oop into a register 1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 1993 1994 // Now handlize the static class mirror it's known not-null. 1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 1997 1998 // Now get the handle 1999 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2000 // store the klass handle as second argument 2001 __ movptr(c_rarg1, oop_handle_reg); 2002 // and protect the arg if we must spill 2003 c_arg--; 2004 } 2005 } else { 2006 // For JNI critical methods we need to save all registers in save_args. 2007 c_arg = 0; 2008 } 2009 2010 // Change state to native (we save the return address in the thread, since it might not 2011 // be pushed on the stack when we do a a stack traversal). It is enough that the pc() 2012 // points into the right code segment. It does not have to be the correct return pc. 2013 // We use the same pc/oopMap repeatedly when we call out 2014 2015 intptr_t the_pc = (intptr_t) __ pc(); 2016 oop_maps->add_gc_map(the_pc - start, map); 2017 2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc); 2019 2020 2021 // We have all of the arguments setup at this point. We must not touch any register 2022 // argument registers at this point (what if we save/restore them there are no oop? 
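  // Every runtime call made between here and the native call itself (the
  // dtrace method-entry probe, the RedefineClasses tracing call, the
  // slow-path lock) brackets itself with save_args()/restore_args() so the
  // argument registers we just shuffled survive.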
2023 2024 { 2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2026 // protect the args we've loaded 2027 save_args(masm, total_c_args, c_arg, out_regs); 2028 __ mov_metadata(c_rarg1, method()); 2029 __ call_VM_leaf( 2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2031 r15_thread, c_rarg1); 2032 restore_args(masm, total_c_args, c_arg, out_regs); 2033 } 2034 2035 // RedefineClasses() tracing support for obsolete method entry 2036 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2037 // protect the args we've loaded 2038 save_args(masm, total_c_args, c_arg, out_regs); 2039 __ mov_metadata(c_rarg1, method()); 2040 __ call_VM_leaf( 2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2042 r15_thread, c_rarg1); 2043 restore_args(masm, total_c_args, c_arg, out_regs); 2044 } 2045 2046 // Lock a synchronized method 2047 2048 // Register definitions used by locking and unlocking 2049 2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2051 const Register obj_reg = rbx; // Will contain the oop 2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2053 const Register old_hdr = r13; // value of old header at unlock time 2054 2055 Label slow_path_lock; 2056 Label lock_done; 2057 2058 if (method->is_synchronized()) { 2059 assert(!is_critical_native, "unhandled"); 2060 2061 2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2063 2064 // Get the handle (the 2nd argument) 2065 __ mov(oop_handle_reg, c_rarg1); 2066 2067 // Get address of the box 2068 2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2070 2071 // Load the oop from the handle 2072 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2073 2074 if (UseBiasedLocking) { 2075 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock); 2076 } 2077 2078 // Load immediate 1 into swap_reg %rax 2079 __ movl(swap_reg, 1); 2080 2081 // Load (object->mark() | 1) into swap_reg %rax 2082 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2083 2084 // Save (object->mark() | 1) into BasicLock's displaced header 2085 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2086 2087 // src -> dest iff dest == rax else rax <- dest 2088 __ lock(); 2089 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2090 __ jcc(Assembler::equal, lock_done); 2091 2092 // Hmm should this move to the slow path code area??? 2093 2094 // Test if the oopMark is an obvious stack pointer, i.e., 2095 // 1) (mark & 3) == 0, and 2096 // 2) rsp <= mark < mark + os::pagesize() 2097 // These 3 tests can be done by evaluating the following 2098 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2099 // assuming both stack pointer and pagesize have their 2100 // least significant 2 bits clear. 
2101 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2102 2103 __ subptr(swap_reg, rsp); 2104 __ andptr(swap_reg, 3 - os::vm_page_size()); 2105 2106 // Save the test result, for recursive case, the result is zero 2107 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2108 __ jcc(Assembler::notEqual, slow_path_lock); 2109 2110 // Slow path will re-enter here 2111 2112 __ bind(lock_done); 2113 } 2114 2115 // Finally just about ready to make the JNI call 2116 2117 // get JNIEnv* which is first argument to native 2118 if (!is_critical_native) { 2119 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2120 2121 // Now set thread in native 2122 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2123 } 2124 2125 __ call(RuntimeAddress(native_func)); 2126 2127 // Verify or restore cpu control state after JNI call 2128 __ restore_cpu_control_state_after_jni(); 2129 2130 // Unpack native results. 2131 switch (ret_type) { 2132 case T_BOOLEAN: __ c2bool(rax); break; 2133 case T_CHAR : __ movzwl(rax, rax); break; 2134 case T_BYTE : __ sign_extend_byte (rax); break; 2135 case T_SHORT : __ sign_extend_short(rax); break; 2136 case T_INT : /* nothing to do */ break; 2137 case T_DOUBLE : 2138 case T_FLOAT : 2139 // Result is in xmm0 we'll save as needed 2140 break; 2141 case T_ARRAY: // Really a handle 2142 case T_OBJECT: // Really a handle 2143 break; // can't de-handlize until after safepoint check 2144 case T_VOID: break; 2145 case T_LONG: break; 2146 default : ShouldNotReachHere(); 2147 } 2148 2149 Label after_transition; 2150 2151 // If this is a critical native, check for a safepoint or suspend request after the call. 2152 // If a safepoint is needed, transition to native, then to native_trans to handle 2153 // safepoints like the native methods that are not critical natives. 2154 if (is_critical_native) { 2155 Label needs_safepoint; 2156 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */); 2157 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2158 __ jcc(Assembler::equal, after_transition); 2159 __ bind(needs_safepoint); 2160 } 2161 2162 // Switch thread to "native transition" state before reading the synchronization state. 2163 // This additional state is necessary because reading and testing the synchronization 2164 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2165 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2166 // VM thread changes sync state to synchronizing and suspends threads for GC. 2167 // Thread A is resumed to finish this native method, but doesn't block here since it 2168 // didn't see any synchronization is progress, and escapes. 
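  // So the state change below is followed by a full fence and an explicit
  // safepoint/suspend poll; if either fires we call
  // check_special_condition_for_native_trans() by hand (not via call_VM)
  // before moving on to _thread_in_Java.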
2169 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2170 2171 // Force this write out before the read below 2172 __ membar(Assembler::Membar_mask_bits( 2173 Assembler::LoadLoad | Assembler::LoadStore | 2174 Assembler::StoreLoad | Assembler::StoreStore)); 2175 2176 // check for safepoint operation in progress and/or pending suspend requests 2177 { 2178 Label Continue; 2179 Label slow_path; 2180 2181 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2182 2183 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2184 __ jcc(Assembler::equal, Continue); 2185 __ bind(slow_path); 2186 2187 // Don't use call_VM as it will see a possible pending exception and forward it 2188 // and never return here preventing us from clearing _last_native_pc down below. 2189 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2190 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2191 // by hand. 2192 // 2193 __ vzeroupper(); 2194 save_native_result(masm, ret_type, stack_slots); 2195 __ mov(c_rarg0, r15_thread); 2196 __ mov(r12, rsp); // remember sp 2197 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2198 __ andptr(rsp, -16); // align stack as required by ABI 2199 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2200 __ mov(rsp, r12); // restore sp 2201 __ reinit_heapbase(); 2202 // Restore any method result value 2203 restore_native_result(masm, ret_type, stack_slots); 2204 __ bind(Continue); 2205 } 2206 2207 // change thread state 2208 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2209 __ bind(after_transition); 2210 2211 Label reguard; 2212 Label reguard_done; 2213 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2214 __ jcc(Assembler::equal, reguard); 2215 __ bind(reguard_done); 2216 2217 // native result if any is live 2218 2219 // Unlock 2220 Label unlock_done; 2221 Label slow_path_unlock; 2222 if (method->is_synchronized()) { 2223 2224 // Get locked oop from the handle we passed to jni 2225 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2226 2227 Label done; 2228 2229 if (UseBiasedLocking) { 2230 __ biased_locking_exit(obj_reg, old_hdr, done); 2231 } 2232 2233 // Simple recursive lock? 
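  // A zero displaced header saved in the lock box means the fast path took a
  // recursive stack lock, so there is nothing to unlock beyond the check below.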
2234 2235 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2236 __ jcc(Assembler::equal, done); 2237 2238 // Must save rax if if it is live now because cmpxchg must use it 2239 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2240 save_native_result(masm, ret_type, stack_slots); 2241 } 2242 2243 2244 // get address of the stack lock 2245 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2246 // get old displaced header 2247 __ movptr(old_hdr, Address(rax, 0)); 2248 2249 // Atomic swap old header if oop still contains the stack lock 2250 __ lock(); 2251 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2252 __ jcc(Assembler::notEqual, slow_path_unlock); 2253 2254 // slow path re-enters here 2255 __ bind(unlock_done); 2256 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2257 restore_native_result(masm, ret_type, stack_slots); 2258 } 2259 2260 __ bind(done); 2261 2262 } 2263 { 2264 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2265 save_native_result(masm, ret_type, stack_slots); 2266 __ mov_metadata(c_rarg1, method()); 2267 __ call_VM_leaf( 2268 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2269 r15_thread, c_rarg1); 2270 restore_native_result(masm, ret_type, stack_slots); 2271 } 2272 2273 __ reset_last_Java_frame(false); 2274 2275 // Unbox oop result, e.g. JNIHandles::resolve value. 2276 if (is_reference_type(ret_type)) { 2277 __ resolve_jobject(rax /* value */, 2278 r15_thread /* thread */, 2279 rcx /* tmp */); 2280 } 2281 2282 if (CheckJNICalls) { 2283 // clear_pending_jni_exception_check 2284 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2285 } 2286 2287 if (!is_critical_native) { 2288 // reset handle block 2289 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2290 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2291 } 2292 2293 // pop our frame 2294 2295 __ leave(); 2296 2297 if (!is_critical_native) { 2298 // Any exception pending? 2299 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2300 __ jcc(Assembler::notEqual, exception_pending); 2301 } 2302 2303 // Return 2304 2305 __ ret(0); 2306 2307 // Unexpected paths are out of line and go here 2308 2309 if (!is_critical_native) { 2310 // forward the exception 2311 __ bind(exception_pending); 2312 2313 // and forward the exception 2314 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2315 } 2316 2317 // Slow path locking & unlocking 2318 if (method->is_synchronized()) { 2319 2320 // BEGIN Slow path lock 2321 __ bind(slow_path_lock); 2322 2323 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2324 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2325 2326 // protect the args we've loaded 2327 save_args(masm, total_c_args, c_arg, out_regs); 2328 2329 __ mov(c_rarg0, obj_reg); 2330 __ mov(c_rarg1, lock_reg); 2331 __ mov(c_rarg2, r15_thread); 2332 2333 // Not a leaf but we have last_Java_frame setup as we want 2334 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2335 restore_args(masm, total_c_args, c_arg, out_regs); 2336 2337 #ifdef ASSERT 2338 { Label L; 2339 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2340 __ jcc(Assembler::equal, L); 2341 __ stop("no pending exception allowed on exit from monitorenter"); 2342 __ bind(L); 2343 } 2344 #endif 2345 __ jmp(lock_done); 2346 2347 // END Slow path lock 2348 2349 // BEGIN Slow path unlock 2350 __ bind(slow_path_unlock); 2351 2352 // If we haven't already saved the native result we must save it now as xmm registers 2353 // are still exposed. 2354 __ vzeroupper(); 2355 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2356 save_native_result(masm, ret_type, stack_slots); 2357 } 2358 2359 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2360 2361 __ mov(c_rarg0, obj_reg); 2362 __ mov(c_rarg2, r15_thread); 2363 __ mov(r12, rsp); // remember sp 2364 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2365 __ andptr(rsp, -16); // align stack as required by ABI 2366 2367 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2368 // NOTE that obj_reg == rbx currently 2369 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2370 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2371 2372 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2373 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2374 __ mov(rsp, r12); // restore sp 2375 __ reinit_heapbase(); 2376 #ifdef ASSERT 2377 { 2378 Label L; 2379 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2380 __ jcc(Assembler::equal, L); 2381 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2382 __ bind(L); 2383 } 2384 #endif /* ASSERT */ 2385 2386 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2387 2388 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2389 restore_native_result(masm, ret_type, stack_slots); 2390 } 2391 __ jmp(unlock_done); 2392 2393 // END Slow path unlock 2394 2395 } // synchronized 2396 2397 // SLOW PATH Reguard the stack if needed 2398 2399 __ bind(reguard); 2400 __ vzeroupper(); 2401 save_native_result(masm, ret_type, stack_slots); 2402 __ mov(r12, rsp); // remember sp 2403 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2404 __ andptr(rsp, -16); // align stack as required by ABI 2405 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2406 __ mov(rsp, r12); // restore sp 2407 __ reinit_heapbase(); 2408 restore_native_result(masm, ret_type, stack_slots); 2409 // and continue 2410 __ jmp(reguard_done); 2411 2412 2413 2414 __ flush(); 2415 2416 nmethod *nm = nmethod::new_native_nmethod(method, 2417 compile_id, 2418 masm->code(), 2419 vep_offset, 2420 frame_complete, 2421 stack_slots / VMRegImpl::slots_per_word, 2422 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2423 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2424 oop_maps); 2425 2426 return nm; 2427 } 2428 2429 // this function returns the adjust size (in number of words) to a c2i adapter 2430 // activation for use during deoptimization 2431 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2432 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2433 } 2434 2435 2436 uint SharedRuntime::out_preserve_stack_slots() { 2437 return 0; 2438 } 2439 2440 2441 // Number of stack slots between incoming argument block and the start of 2442 // a new frame. The PROLOG must add this many slots to the stack. The 2443 // EPILOG must remove this many slots. amd64 needs two slots for 2444 // return address. 2445 uint SharedRuntime::in_preserve_stack_slots() { 2446 return 4 + 2 * VerifyStackAtCalls; 2447 } 2448 2449 //------------------------------generate_deopt_blob---------------------------- 2450 void SharedRuntime::generate_deopt_blob() { 2451 // Allocate space for the code 2452 ResourceMark rm; 2453 // Setup code generation tools 2454 int pad = 0; 2455 if (UseAVX > 2) { 2456 pad += 1024; 2457 } 2458 #if INCLUDE_JVMCI 2459 if (EnableJVMCI) { 2460 pad += 512; // Increase the buffer size when compiling for JVMCI 2461 } 2462 #endif 2463 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2464 MacroAssembler* masm = new MacroAssembler(&buffer); 2465 int frame_size_in_words; 2466 OopMap* map = NULL; 2467 OopMapSet *oop_maps = new OopMapSet(); 2468 2469 // ------------- 2470 // This code enters when returning to a de-optimized nmethod. A return 2471 // address has been pushed on the the stack, and return values are in 2472 // registers. 2473 // If we are doing a normal deopt then we were called from the patched 2474 // nmethod from the point we returned to the nmethod. So the return 2475 // address on the stack is wrong by NativeCall::instruction_size 2476 // We will adjust the value so it looks like we have the original return 2477 // address on the stack (like when we eagerly deoptimized). 2478 // In the case of an exception pending when deoptimizing, we enter 2479 // with a return address on the stack that points after the call we patched 2480 // into the exception handler. We have the following register state from, 2481 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2482 // rax: exception oop 2483 // rbx: exception handler 2484 // rdx: throwing pc 2485 // So in this case we simply jam rdx into the useless return address and 2486 // the stack looks just like we want. 2487 // 2488 // At this point we need to de-opt. We save the argument return 2489 // registers. We call the first C routine, fetch_unroll_info(). This 2490 // routine captures the return values and returns a structure which 2491 // describes the current frame size and the sizes of all replacement frames. 2492 // The current frame is compiled code and may contain many inlined 2493 // functions, each with their own JVM state. We pop the current frame, then 2494 // push all the new frames. Then we call the C routine unpack_frames() to 2495 // populate these frames. Finally unpack_frames() returns us the new target 2496 // address. Notice that callee-save registers are BLOWN here; they have 2497 // already been captured in the vframeArray at the time the return PC was 2498 // patched. 2499 address start = __ pc(); 2500 Label cont; 2501 2502 // Prolog for non exception case! 
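  // The blob has several entry points: normal deoptimization (here), the
  // reexecute case, the JVMCI uncommon-trap/implicit-exception entries (when
  // INCLUDE_JVMCI), and the exception cases below. Each one saves the full
  // register state and records its Deoptimization::Unpack_* mode in
  // callee-saved r14.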
2503 
2504 // Save everything in sight.
2505 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2506 
2507 // Normal deoptimization. Save exec mode for unpack_frames.
2508 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2509 __ jmp(cont);
2510 
2511 int reexecute_offset = __ pc() - start;
2512 #if INCLUDE_JVMCI && !defined(COMPILER1)
2513 if (EnableJVMCI && UseJVMCICompiler) {
2514 // JVMCI does not use this kind of deoptimization
2515 __ should_not_reach_here();
2516 }
2517 #endif
2518 
2519 // Reexecute case
2520 // The return address is the pc that describes which bci to re-execute at.
2521 
2522 // No need to update map as each call to save_live_registers will produce identical oopmap
2523 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2524 
2525 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2526 __ jmp(cont);
2527 
2528 #if INCLUDE_JVMCI
2529 Label after_fetch_unroll_info_call;
2530 int implicit_exception_uncommon_trap_offset = 0;
2531 int uncommon_trap_offset = 0;
2532 
2533 if (EnableJVMCI) {
2534 implicit_exception_uncommon_trap_offset = __ pc() - start;
2535 
2536 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2537 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2538 
2539 uncommon_trap_offset = __ pc() - start;
2540 
2541 // Save everything in sight.
2542 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2543 // fetch_unroll_info needs to call last_java_frame()
2544 __ set_last_Java_frame(noreg, noreg, NULL);
2545 
2546 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2547 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2548 
2549 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2550 __ mov(c_rarg0, r15_thread);
2551 __ movl(c_rarg2, r14); // exec mode
2552 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2553 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2554 
2555 __ reset_last_Java_frame(false);
2556 
2557 __ jmp(after_fetch_unroll_info_call);
2558 } // EnableJVMCI
2559 #endif // INCLUDE_JVMCI
2560 
2561 int exception_offset = __ pc() - start;
2562 
2563 // Prolog for exception case
2564 
2565 // all registers are dead at this entry point, except for rax and
2566 // rdx, which contain the exception oop and exception pc
2567 // respectively. Set them in TLS and fall thru to the
2568 // unpack_with_exception_in_tls entry point.
2569 
2570 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2571 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2572 
2573 int exception_in_tls_offset = __ pc() - start;
2574 
2575 // new implementation because exception oop is now passed in JavaThread
2576 
2577 // Prolog for exception case
2578 // All registers must be preserved because they might be used by LinearScan
2579 // Exception oop and throwing PC are passed in JavaThread
2580 // tos: stack at point of call to method that threw the exception (i.e. only
2581 // args are on the stack, no return address)
2582 
2583 // make room on stack for the return address
2584 // It will be patched later with the throwing pc. The correct value is not
2585 // available now because loading it from memory would destroy registers.
2586 __ push(0); 2587 2588 // Save everything in sight. 2589 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2590 2591 // Now it is safe to overwrite any register 2592 2593 // Deopt during an exception. Save exec mode for unpack_frames. 2594 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2595 2596 // load throwing pc from JavaThread and patch it as the return address 2597 // of the current frame. Then clear the field in JavaThread 2598 2599 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2600 __ movptr(Address(rbp, wordSize), rdx); 2601 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2602 2603 #ifdef ASSERT 2604 // verify that there is really an exception oop in JavaThread 2605 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2606 __ verify_oop(rax); 2607 2608 // verify that there is no pending exception 2609 Label no_pending_exception; 2610 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2611 __ testptr(rax, rax); 2612 __ jcc(Assembler::zero, no_pending_exception); 2613 __ stop("must not have pending exception here"); 2614 __ bind(no_pending_exception); 2615 #endif 2616 2617 __ bind(cont); 2618 2619 // Call C code. Need thread and this frame, but NOT official VM entry 2620 // crud. We cannot block on this call, no GC can happen. 2621 // 2622 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2623 2624 // fetch_unroll_info needs to call last_java_frame(). 2625 2626 __ set_last_Java_frame(noreg, noreg, NULL); 2627 #ifdef ASSERT 2628 { Label L; 2629 __ cmpptr(Address(r15_thread, 2630 JavaThread::last_Java_fp_offset()), 2631 (int32_t)0); 2632 __ jcc(Assembler::equal, L); 2633 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2634 __ bind(L); 2635 } 2636 #endif // ASSERT 2637 __ mov(c_rarg0, r15_thread); 2638 __ movl(c_rarg1, r14); // exec_mode 2639 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2640 2641 // Need to have an oopmap that tells fetch_unroll_info where to 2642 // find any register it might need. 2643 oop_maps->add_gc_map(__ pc() - start, map); 2644 2645 __ reset_last_Java_frame(false); 2646 2647 #if INCLUDE_JVMCI 2648 if (EnableJVMCI) { 2649 __ bind(after_fetch_unroll_info_call); 2650 } 2651 #endif 2652 2653 // Load UnrollBlock* into rdi 2654 __ mov(rdi, rax); 2655 2656 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2657 Label noException; 2658 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2659 __ jcc(Assembler::notEqual, noException); 2660 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2661 // QQQ this is useless it was NULL above 2662 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2663 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2664 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2665 2666 __ verify_oop(rax); 2667 2668 // Overwrite the result registers with the exception results. 2669 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2670 // I think this is useless 2671 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2672 2673 __ bind(noException); 2674 2675 // Only register save data is on the stack. 2676 // Now restore the result registers. 
// Everything else is either dead
2677 // or captured in the vframeArray.
2678 RegisterSaver::restore_result_registers(masm);
2679 
2680 // All of the register save area has been popped off the stack. Only the
2681 // return address remains.
2682 
2683 // Pop all the frames we must move/replace.
2684 //
2685 // Frame picture (youngest to oldest)
2686 // 1: self-frame (no frame link)
2687 // 2: deopting frame (no frame link)
2688 // 3: caller of deopting frame (could be compiled/interpreted).
2689 //
2690 // Note: by leaving the return address of self-frame on the stack
2691 // and using the size of frame 2 to adjust the stack,
2692 // when we are done the return to frame 3 will still be on the stack.
2693 
2694 // Pop deoptimized frame
2695 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2696 __ addptr(rsp, rcx);
2697 
2698 // rsp should be pointing at the return address to the caller (3)
2699 
2700 // Pick up the initial fp we should save
2701 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2702 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2703 
2704 #ifdef ASSERT
2705 // Compilers generate code that bang the stack by as much as the
2706 // interpreter would need. So this stack banging should never
2707 // trigger a fault. Verify that it does not on non-product builds.
2708 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2709 __ bang_stack_size(rbx, rcx);
2710 #endif
2711 
2712 // Load address of array of frame pcs into rcx
2713 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2714 
2715 // Trash the old pc
2716 __ addptr(rsp, wordSize);
2717 
2718 // Load address of array of frame sizes into rsi
2719 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2720 
2721 // Load counter into rdx
2722 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2723 
2724 // Now adjust the caller's stack to make up for the extra locals
2725 // but record the original sp so that we can save it in the skeletal interpreter
2726 // frame and the stack walking of interpreter_sender will get the unextended sp
2727 // value and not the "real" sp value.
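  // The loop below builds one skeletal interpreter frame per entry in the
  // UnrollBlock's frame_sizes/frame_pcs arrays: push the saved return pc,
  // enter() to link rbp, then drop rsp by the recorded frame size minus the
  // two words already pushed. layout_activation_impl() fixes up the details
  // later.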
2728 2729 const Register sender_sp = r8; 2730 2731 __ mov(sender_sp, rsp); 2732 __ movl(rbx, Address(rdi, 2733 Deoptimization::UnrollBlock:: 2734 caller_adjustment_offset_in_bytes())); 2735 __ subptr(rsp, rbx); 2736 2737 // Push interpreter frames in a loop 2738 Label loop; 2739 __ bind(loop); 2740 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2741 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2742 __ pushptr(Address(rcx, 0)); // Save return address 2743 __ enter(); // Save old & set new ebp 2744 __ subptr(rsp, rbx); // Prolog 2745 // This value is corrected by layout_activation_impl 2746 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2747 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2748 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2749 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2750 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2751 __ decrementl(rdx); // Decrement counter 2752 __ jcc(Assembler::notZero, loop); 2753 __ pushptr(Address(rcx, 0)); // Save final return address 2754 2755 // Re-push self-frame 2756 __ enter(); // Save old & set new ebp 2757 2758 // Allocate a full sized register save area. 2759 // Return address and rbp are in place, so we allocate two less words. 2760 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2761 2762 // Restore frame locals after moving the frame 2763 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2764 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2765 2766 // Call C code. Need thread but NOT official VM entry 2767 // crud. We cannot block on this call, no GC can happen. Call should 2768 // restore return values to their stack-slots with the new SP. 2769 // 2770 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2771 2772 // Use rbp because the frames look interpreted now 2773 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2774 // Don't need the precise return PC here, just precise enough to point into this code blob. 2775 address the_pc = __ pc(); 2776 __ set_last_Java_frame(noreg, rbp, the_pc); 2777 2778 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2779 __ mov(c_rarg0, r15_thread); 2780 __ movl(c_rarg1, r14); // second arg: exec_mode 2781 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2782 // Revert SP alignment after call since we're going to do some SP relative addressing below 2783 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2784 2785 // Set an oopmap for the call site 2786 // Use the same PC we used for the last java frame 2787 oop_maps->add_gc_map(the_pc - start, 2788 new OopMap( frame_size_in_words, 0 )); 2789 2790 // Clear fp AND pc 2791 __ reset_last_Java_frame(true); 2792 2793 // Collect return values 2794 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2795 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2796 // I think this is useless (throwing pc?) 2797 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2798 2799 // Pop self-frame. 
2800 __ leave(); // Epilog 2801 2802 // Jump to interpreter 2803 __ ret(0); 2804 2805 // Make sure all code is generated 2806 masm->flush(); 2807 2808 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2809 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2810 #if INCLUDE_JVMCI 2811 if (EnableJVMCI) { 2812 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2813 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2814 } 2815 #endif 2816 } 2817 2818 #ifdef COMPILER2 2819 //------------------------------generate_uncommon_trap_blob-------------------- 2820 void SharedRuntime::generate_uncommon_trap_blob() { 2821 // Allocate space for the code 2822 ResourceMark rm; 2823 // Setup code generation tools 2824 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2825 MacroAssembler* masm = new MacroAssembler(&buffer); 2826 2827 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2828 2829 address start = __ pc(); 2830 2831 if (UseRTMLocking) { 2832 // Abort RTM transaction before possible nmethod deoptimization. 2833 __ xabort(0); 2834 } 2835 2836 // Push self-frame. We get here with a return address on the 2837 // stack, so rsp is 8-byte aligned until we allocate our frame. 2838 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2839 2840 // No callee saved registers. rbp is assumed implicitly saved 2841 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2842 2843 // compiler left unloaded_class_index in j_rarg0 move to where the 2844 // runtime expects it. 2845 __ movl(c_rarg1, j_rarg0); 2846 2847 __ set_last_Java_frame(noreg, noreg, NULL); 2848 2849 // Call C code. Need thread but NOT official VM entry 2850 // crud. We cannot block on this call, no GC can happen. Call should 2851 // capture callee-saved registers as well as return values. 2852 // Thread is in rdi already. 2853 // 2854 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2855 2856 __ mov(c_rarg0, r15_thread); 2857 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2858 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2859 2860 // Set an oopmap for the call site 2861 OopMapSet* oop_maps = new OopMapSet(); 2862 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2863 2864 // location of rbp is known implicitly by the frame sender code 2865 2866 oop_maps->add_gc_map(__ pc() - start, map); 2867 2868 __ reset_last_Java_frame(false); 2869 2870 // Load UnrollBlock* into rdi 2871 __ mov(rdi, rax); 2872 2873 #ifdef ASSERT 2874 { Label L; 2875 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2876 (int32_t)Deoptimization::Unpack_uncommon_trap); 2877 __ jcc(Assembler::equal, L); 2878 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2879 __ bind(L); 2880 } 2881 #endif 2882 2883 // Pop all the frames we must move/replace. 2884 // 2885 // Frame picture (youngest to oldest) 2886 // 1: self-frame (no frame link) 2887 // 2: deopting frame (no frame link) 2888 // 3: caller of deopting frame (could be compiled/interpreted). 2889 2890 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2891 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
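  // From here on the uncommon trap blob mirrors the tail of the deopt blob:
  // pop the deoptimized frame, push a skeletal interpreter frame for each
  // UnrollBlock entry, and call unpack_frames() with Unpack_uncommon_trap to
  // fill them in.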
2892 2893 // Pop deoptimized frame (int) 2894 __ movl(rcx, Address(rdi, 2895 Deoptimization::UnrollBlock:: 2896 size_of_deoptimized_frame_offset_in_bytes())); 2897 __ addptr(rsp, rcx); 2898 2899 // rsp should be pointing at the return address to the caller (3) 2900 2901 // Pick up the initial fp we should save 2902 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2903 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2904 2905 #ifdef ASSERT 2906 // Compilers generate code that bang the stack by as much as the 2907 // interpreter would need. So this stack banging should never 2908 // trigger a fault. Verify that it does not on non product builds. 2909 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2910 __ bang_stack_size(rbx, rcx); 2911 #endif 2912 2913 // Load address of array of frame pcs into rcx (address*) 2914 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2915 2916 // Trash the return pc 2917 __ addptr(rsp, wordSize); 2918 2919 // Load address of array of frame sizes into rsi (intptr_t*) 2920 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2921 2922 // Counter 2923 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2924 2925 // Now adjust the caller's stack to make up for the extra locals but 2926 // record the original sp so that we can save it in the skeletal 2927 // interpreter frame and the stack walking of interpreter_sender 2928 // will get the unextended sp value and not the "real" sp value. 2929 2930 const Register sender_sp = r8; 2931 2932 __ mov(sender_sp, rsp); 2933 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2934 __ subptr(rsp, rbx); 2935 2936 // Push interpreter frames in a loop 2937 Label loop; 2938 __ bind(loop); 2939 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2940 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2941 __ pushptr(Address(rcx, 0)); // Save return address 2942 __ enter(); // Save old & set new rbp 2943 __ subptr(rsp, rbx); // Prolog 2944 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2945 sender_sp); // Make it walkable 2946 // This value is corrected by layout_activation_impl 2947 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2948 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2949 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2950 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2951 __ decrementl(rdx); // Decrement counter 2952 __ jcc(Assembler::notZero, loop); 2953 __ pushptr(Address(rcx, 0)); // Save final return address 2954 2955 // Re-push self-frame 2956 __ enter(); // Save old & set new rbp 2957 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2958 // Prolog 2959 2960 // Use rbp because the frames look interpreted now 2961 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2962 // Don't need the precise return PC here, just precise enough to point into this code blob. 2963 address the_pc = __ pc(); 2964 __ set_last_Java_frame(noreg, rbp, the_pc); 2965 2966 // Call C code. Need thread but NOT official VM entry 2967 // crud. We cannot block on this call, no GC can happen. 
Call should 2968 // restore return values to their stack-slots with the new SP. 2969 // Thread is in rdi already. 2970 // 2971 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 2972 2973 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 2974 __ mov(c_rarg0, r15_thread); 2975 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 2976 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2977 2978 // Set an oopmap for the call site 2979 // Use the same PC we used for the last java frame 2980 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 2981 2982 // Clear fp AND pc 2983 __ reset_last_Java_frame(true); 2984 2985 // Pop self-frame. 2986 __ leave(); // Epilog 2987 2988 // Jump to interpreter 2989 __ ret(0); 2990 2991 // Make sure all code is generated 2992 masm->flush(); 2993 2994 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 2995 SimpleRuntimeFrame::framesize >> 1); 2996 } 2997 #endif // COMPILER2 2998 2999 //------------------------------generate_handler_blob------ 3000 // 3001 // Generate a special Compile2Runtime blob that saves all registers, 3002 // and setup oopmap. 3003 // 3004 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3005 assert(StubRoutines::forward_exception_entry() != NULL, 3006 "must be generated before"); 3007 3008 ResourceMark rm; 3009 OopMapSet *oop_maps = new OopMapSet(); 3010 OopMap* map; 3011 3012 // Allocate space for the code. Setup code generation tools. 3013 CodeBuffer buffer("handler_blob", 2048, 1024); 3014 MacroAssembler* masm = new MacroAssembler(&buffer); 3015 3016 address start = __ pc(); 3017 address call_pc = NULL; 3018 int frame_size_in_words; 3019 bool cause_return = (poll_type == POLL_AT_RETURN); 3020 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3021 3022 if (UseRTMLocking) { 3023 // Abort RTM transaction before calling runtime 3024 // because critical section will be large and will be 3025 // aborted anyway. Also nmethod could be deoptimized. 3026 __ xabort(0); 3027 } 3028 3029 // Make room for return address (or push it again) 3030 if (!cause_return) { 3031 __ push(rbx); 3032 } 3033 3034 // Save registers, fpu state, and flags 3035 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3036 3037 // The following is basically a call_VM. However, we need the precise 3038 // address of the call in order to generate an oopmap. Hence, we do all the 3039 // work outselves. 3040 3041 __ set_last_Java_frame(noreg, noreg, NULL); 3042 3043 // The return address must always be correct so that frame constructor never 3044 // sees an invalid pc. 3045 3046 if (!cause_return) { 3047 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3048 // Additionally, rbx is a callee saved register and we can look at it later to determine 3049 // if someone changed the return address for us! 3050 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3051 __ movptr(Address(rbp, wordSize), rbx); 3052 } 3053 3054 // Do the call 3055 __ mov(c_rarg0, r15_thread); 3056 __ call(RuntimeAddress(call_ptr)); 3057 3058 // Set an oopmap for the call site. This oopmap will map all 3059 // oop-registers and debug-info registers as callee-saved. This 3060 // will allow deoptimization at this safepoint to find all possible 3061 // debug-info recordings, as well as let GC find all oops. 
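  // After the call we either forward a pending exception, or (when the poll
  // was not at a return) step the stashed return pc over the safepoint poll
  // instruction, allowing for a REX prefix and the longer rsp/rbp/r12/r13
  // base encodings, before restoring registers and returning.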

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00          test   %eax,(%rax)
    // 85 01          test   %eax,(%rcx)
    // 85 02          test   %eax,(%rdx)
    // 85 03          test   %eax,(%rbx)
    // 85 06          test   %eax,(%rsi)
    // 85 07          test   %eax,(%rdi)
    //
    // 41 85 00       test   %eax,(%r8)
    // 41 85 01       test   %eax,(%r9)
    // 41 85 02       test   %eax,(%r10)
    // 41 85 03       test   %eax,(%r11)
    // 41 85 06       test   %eax,(%r14)
    // 41 85 07       test   %eax,(%r15)
    //
    // 85 04 24       test   %eax,(%rsp)
    // 41 85 04 24    test   %eax,(%r12)
    // 85 45 00       test   %eax,0x0(%rbp)
    // 41 85 45 00    test   %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}
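// Descriptive note (comment only): a worked example of the poll-skip logic in
// generate_handler_blob() above.  For the poll encoding 41 85 45 00 (test %eax,0x0(%r13)),
// the stashed return pc in rbx initially points at the 0x41 REX prefix: the prefix check
// advances rbx by one byte, the modrm check sees base r13 (low bits 0x05) and advances by
// one more byte for the disp8, and the final addptr(rbx, 2) steps over the 0x85 opcode and
// the modrm byte, so the adjusted return pc lands exactly after the four-byte poll.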

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into vm to find out the proper destination
// of a java call.  All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1000, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = NULL;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, NULL);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
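  // Descriptive note (comment only): frame_size_in_words comes from
  // RegisterSaver::save_live_registers() above, so the size handed to
  // new_runtime_stub() below is in words, not bytes.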
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

#ifdef COMPILER2
static const int native_invoker_code_size = MethodHandles::adapter_code_size;

class NativeInvokerGenerator : public StubCodeGenerator {
  address _call_target;
  int _shadow_space_bytes;

  const GrowableArray<VMReg>& _input_registers;
  const GrowableArray<VMReg>& _output_registers;

  int _frame_complete;
  int _framesize;
  OopMapSet* _oop_maps;
public:
  NativeInvokerGenerator(CodeBuffer* buffer,
                         address call_target,
                         int shadow_space_bytes,
                         const GrowableArray<VMReg>& input_registers,
                         const GrowableArray<VMReg>& output_registers)
   : StubCodeGenerator(buffer, PrintMethodHandleStubs),
     _call_target(call_target),
     _shadow_space_bytes(shadow_space_bytes),
     _input_registers(input_registers),
     _output_registers(output_registers),
     _frame_complete(0),
     _framesize(0),
     _oop_maps(NULL) {
    assert(_output_registers.length() <= 1
           || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
  }

  void generate();

  int spill_size_in_bytes() const {
    if (_output_registers.length() == 0) {
      return 0;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    if (reg->is_Register()) {
      return 8;
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        return 64;
      } else if (UseAVX >= 1) {
        return 32;
      } else {
        return 16;
      }
    } else {
      ShouldNotReachHere();
    }
    return 0;
  }

  void spill_out_registers() {
    if (_output_registers.length() == 0) {
      return;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    MacroAssembler* masm = _masm;
    if (reg->is_Register()) {
      __ movptr(Address(rsp, 0), reg->as_Register());
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
      } else if (UseAVX >= 1) {
        __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
      } else {
        __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
      }
    } else {
      ShouldNotReachHere();
    }
  }

  void fill_out_registers() {
    if (_output_registers.length() == 0) {
      return;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    MacroAssembler* masm = _masm;
    if (reg->is_Register()) {
      __ movptr(reg->as_Register(), Address(rsp, 0));
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
      } else if (UseAVX >= 1) {
        __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
      } else {
        __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
      }
    } else {
      ShouldNotReachHere();
    }
  }

  int frame_complete() const {
    return _frame_complete;
  }

  int framesize() const {
    return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
  }

  OopMapSet* oop_maps() const {
    return _oop_maps;
  }

private:
#ifdef ASSERT
  bool target_uses_register(VMReg reg) {
    return _input_registers.contains(reg) || _output_registers.contains(reg);
  }
#endif
};

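// Descriptive note (comment only): spill_size_in_bytes() above reserves room for the
// single return register of the downcall -- 8 bytes for a general-purpose register, and
// 16/32/64 bytes for an XMM register depending on whether at most SSE, AVX, or AVX-512
// is in use -- so that spill_out_registers()/fill_out_registers() can preserve the
// return value across the runtime calls made on the slow paths in generate().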
RuntimeStub*
SharedRuntime::make_native_invoker(address call_target,
                                   int shadow_space_bytes,
                                   const GrowableArray<VMReg>& input_registers,
                                   const GrowableArray<VMReg>& output_registers) {
  int locs_size = 64;
  CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
  NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
  g.generate();
  code.log_section_sizes("nep_invoker_blob");

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub("nep_invoker_blob",
                                  &code,
                                  g.frame_complete(),
                                  g.framesize(),
                                  g.oop_maps(), false);
  return stub;
}

void NativeInvokerGenerator::generate() {
  assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");

  enum layout {
    rbp_off,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
  assert(is_even(_framesize/2), "sp not 16-byte aligned");

  _oop_maps = new OopMapSet();
  MacroAssembler* masm = _masm;

  address start = __ pc();

  __ enter();

  // return address and rbp are already in place
  __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog

  _frame_complete = __ pc() - start;

  address the_pc = __ pc();

  __ set_last_Java_frame(rsp, rbp, (address)the_pc);
  OopMap* map = new OopMap(_framesize, 0);
  _oop_maps->add_gc_map(the_pc - start, map);

  // State transition
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(_call_target));

  __ restore_cpu_control_state_after_jni();

  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  __ membar(Assembler::Membar_mask_bits(
              Assembler::LoadLoad | Assembler::LoadStore |
              Assembler::StoreLoad | Assembler::StoreStore));

  Label L_after_safepoint_poll;
  Label L_safepoint_poll_slow_path;

  __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
  __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
  __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);

  __ bind(L_after_safepoint_poll);

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);

  __ block_comment("reguard stack check");
  Label L_reguard;
  Label L_after_reguard;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, L_reguard);
  __ bind(L_after_reguard);

  __ reset_last_Java_frame(r15_thread, true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  //////////////////////////////////////////////////////////////////////////////

  __ block_comment("{ L_safepoint_poll_slow_path");
  __ bind(L_safepoint_poll_slow_path);
  __ vzeroupper();

  spill_out_registers();

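  // Descriptive note (comment only): the downcall's return value is sitting in a
  // caller-saved register (rax or an XMM register) which the C runtime call below is
  // free to clobber, so spill_out_registers() parks it in the frame here and
  // fill_out_registers() restores it once the call returns.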
  __ mov(c_rarg0, r15_thread);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();

  fill_out_registers();

  __ jmp(L_after_safepoint_poll);
  __ block_comment("} L_safepoint_poll_slow_path");

  //////////////////////////////////////////////////////////////////////////////

  __ block_comment("{ L_reguard");
  __ bind(L_reguard);
  __ vzeroupper();

  spill_out_registers();

  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();

  fill_out_registers();

  __ jmp(L_after_reguard);

  __ block_comment("} L_reguard");

  //////////////////////////////////////////////////////////////////////////////

  __ flush();
}
#endif // COMPILER2

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                   \
do {                                                              \
  unsigned long hi, lo;                                           \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; "   \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"             \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)    \
           : "r"(A), "a"(B) : "cc");                              \
} while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)

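// Descriptive note (comment only, applies to both MACC variants): the macro implements
// the 192-bit accumulation (T2:T1:T0) += A * B, where A * B is the full 128-bit product.
// The low product word is added into T0, the high word plus any carry into T1, and a
// final carry, if one occurs, into T2.  MACC2 adds the product in twice.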
// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

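// Descriptive note (comment only): a small worked example of swap() above and
// reverse_words() below.  swap(0x0000000100000002ULL) == 0x0000000200000001ULL, and for
// s = { 0x0000000100000002, 0x0000000300000004 } with len == 2, reverse_words(s, d, 2)
// produces d = { 0x0000000400000003, 0x0000000200000001 }.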
// Copy len longwords from s to d, word-swapping as we go.  The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

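// Descriptive note (comment only): the guarantee in montgomery_multiply() above works
// out to longwords <= 8192 / (8 * 4) = 256, i.e. len <= 512 jints (a 16384-bit integer),
// and the four scratch arrays then occupy at most 4 * 256 * 8 = 8192 bytes of stack.
// The squaring case is analogous with three scratch arrays.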
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// creates exception blob at the end
// Using the exception blob, this code is jumped to from a compiled method.
// (see emit_exception_handler in x86_64.ad file)
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()),  rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
                                       int total_in_args, const VMRegPair* in_regs,
                                       int total_out_args, VMRegPair* out_regs,
                                       GrowableArray<int>& arg_order,
                                       VMRegPair tmp_vmreg) {
  ComputeMoveOrder order(total_in_args, in_regs,
                         total_out_args, out_regs,
                         in_sig_bt, arg_order, tmp_vmreg);
}
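// Descriptive note (comment only): compute_move_order() has no explicit return value;
// arg_order is passed by reference and, as far as this file is concerned, appears to be
// populated as a side effect of constructing ComputeMoveOrder, giving callers an order
// in which argument moves can be performed without overwriting source registers that are
// still needed, with tmp_vmreg presumably available as a scratch location.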