/*
 * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
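  // Illustrative note (not part of the original comment): since offsets are in
  // jint-sized slots, a slot offset such as rax_off converts to a byte offset
  // as rax_off * BytesPerInt, which is exactly what the *_offset_in_bytes()
  // accessors below compute.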
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
// up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
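  // Illustrative example (not part of the original comment): for a Java
  // signature (int, long, Object, double, float) the loop below assigns
  //   int    -> j_rarg0
  //   long   -> j_rarg1   (its trailing T_VOID half gets no register)
  //   Object -> j_rarg2
  //   double -> j_farg0
  //   float  -> j_farg1
  // and any overflow arguments fall into 4-byte stack slots, bumping stk_args
  // by 2 per argument.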
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the callers callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float; use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address and misalign the stack so that the youngest frame
  // sees it exactly where a call instruction would have placed it.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // Put Method* where a c2i would expect it, should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in rax.
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
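
  // Illustrative summary (not part of the original comments): the code below
  // lays out the remaining entry points of the blob after the i2c_entry
  // generated above: c2i_unverified_entry (inline-cache klass check that falls
  // through on a hit), c2i_entry (optional class-initialization barrier for
  // static methods, recorded as c2i_no_clinit_check_entry when bypassed), and
  // finally the gen_c2i_adapter body itself.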

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
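
  // Illustrative example (not part of the original comments): for a C
  // signature (JNIEnv*, jobject, jint, jdouble) the loop below assigns, on
  // the System V ABI, c_rarg0..c_rarg2 for the pointer/int arguments and
  // c_farg0 for the double; on Win64 the same arguments land in
  // c_rarg0..c_rarg2 and c_farg3, because integer and FP arguments share
  // positions there, and the trailing "if (stk_args < 8)" clamp always
  // reserves the 32-byte shadow area.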

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
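
// Illustrative example (not part of the original comment): for the cyclic
// moves r2 := r1, r3 := r2, r1 := r3, ComputeMoveOrder links the moves into a
// chain, detects the cycle, and breaks it with the temp register, emitting
//   tmp := r3,  r3 := r2,  r2 := r1,  r1 := tmp
// so that no value is clobbered before it has been read.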
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair        _src;
    VMRegPair        _dst;
    int              _src_index;
    int              _dst_index;
    bool             _processed;
    MoveOperation*   _next;
    MoveOperation*   _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collected all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point,  so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method. The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to java state (possibly blocking), unhandlizes any result and
// returns.
//
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state
// _in_Java, since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI
// handle block and the check for pending exceptions, since it's impossible
// for them to be thrown.
//
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type,
                                                address critical_entry) {
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       (OopMapSet*)NULL);
  }
  bool is_critical_native = true;
  address native_func = critical_entry;
  if (native_func == NULL) {
    native_func = method->native_function();
    is_critical_native = false;
  }
  assert(native_func != NULL, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them.
To figure out where they go 1588 // we convert the java signature to a C signature by inserting 1589 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1590 1591 const int total_in_args = method->size_of_parameters(); 1592 int total_c_args = total_in_args; 1593 if (!is_critical_native) { 1594 total_c_args += 1; 1595 if (method->is_static()) { 1596 total_c_args++; 1597 } 1598 } else { 1599 for (int i = 0; i < total_in_args; i++) { 1600 if (in_sig_bt[i] == T_ARRAY) { 1601 total_c_args++; 1602 } 1603 } 1604 } 1605 1606 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1607 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1608 BasicType* in_elem_bt = NULL; 1609 1610 int argc = 0; 1611 if (!is_critical_native) { 1612 out_sig_bt[argc++] = T_ADDRESS; 1613 if (method->is_static()) { 1614 out_sig_bt[argc++] = T_OBJECT; 1615 } 1616 1617 for (int i = 0; i < total_in_args ; i++ ) { 1618 out_sig_bt[argc++] = in_sig_bt[i]; 1619 } 1620 } else { 1621 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); 1622 SignatureStream ss(method->signature()); 1623 for (int i = 0; i < total_in_args ; i++ ) { 1624 if (in_sig_bt[i] == T_ARRAY) { 1625 // Arrays are passed as int, elem* pair 1626 out_sig_bt[argc++] = T_INT; 1627 out_sig_bt[argc++] = T_ADDRESS; 1628 ss.skip_array_prefix(1); // skip one '[' 1629 assert(ss.is_primitive(), "primitive type expected"); 1630 in_elem_bt[i] = ss.type(); 1631 } else { 1632 out_sig_bt[argc++] = in_sig_bt[i]; 1633 in_elem_bt[i] = T_VOID; 1634 } 1635 if (in_sig_bt[i] != T_VOID) { 1636 assert(in_sig_bt[i] == ss.type() || 1637 in_sig_bt[i] == T_ARRAY, "must match"); 1638 ss.next(); 1639 } 1640 } 1641 } 1642 1643 // Now figure out where the args must be stored and how much stack space 1644 // they require. 1645 int out_arg_slots; 1646 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1647 1648 // Compute framesize for the wrapper. We need to handlize all oops in 1649 // incoming registers 1650 1651 // Calculate the total number of stack slots we will need. 1652 1653 // First count the abi requirement plus all of the outgoing args 1654 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1655 1656 // Now the space for the inbound oop handle area 1657 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1658 if (is_critical_native) { 1659 // Critical natives may have to call out so they need a save area 1660 // for register arguments. 
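      // Illustrative sizing, assuming a hypothetical critical native taking
      // (jint, jlong, jbyteArray) entirely in registers: the loop below counts
      // one single slot for the jint and one double slot each for the jlong
      // and the T_ARRAY (arrays are double slots on LP64, see 7145024), so
      // total_save_slots = 2 * 2 + 1 = 5 before the alignment step.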
1661 int double_slots = 0; 1662 int single_slots = 0; 1663 for ( int i = 0; i < total_in_args; i++) { 1664 if (in_regs[i].first()->is_Register()) { 1665 const Register reg = in_regs[i].first()->as_Register(); 1666 switch (in_sig_bt[i]) { 1667 case T_BOOLEAN: 1668 case T_BYTE: 1669 case T_SHORT: 1670 case T_CHAR: 1671 case T_INT: single_slots++; break; 1672 case T_ARRAY: // specific to LP64 (7145024) 1673 case T_LONG: double_slots++; break; 1674 default: ShouldNotReachHere(); 1675 } 1676 } else if (in_regs[i].first()->is_XMMRegister()) { 1677 switch (in_sig_bt[i]) { 1678 case T_FLOAT: single_slots++; break; 1679 case T_DOUBLE: double_slots++; break; 1680 default: ShouldNotReachHere(); 1681 } 1682 } else if (in_regs[i].first()->is_FloatRegister()) { 1683 ShouldNotReachHere(); 1684 } 1685 } 1686 total_save_slots = double_slots * 2 + single_slots; 1687 // align the save area 1688 if (double_slots != 0) { 1689 stack_slots = align_up(stack_slots, 2); 1690 } 1691 } 1692 1693 int oop_handle_offset = stack_slots; 1694 stack_slots += total_save_slots; 1695 1696 // Now any space we need for handlizing a klass if static method 1697 1698 int klass_slot_offset = 0; 1699 int klass_offset = -1; 1700 int lock_slot_offset = 0; 1701 bool is_static = false; 1702 1703 if (method->is_static()) { 1704 klass_slot_offset = stack_slots; 1705 stack_slots += VMRegImpl::slots_per_word; 1706 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1707 is_static = true; 1708 } 1709 1710 // Plus a lock if needed 1711 1712 if (method->is_synchronized()) { 1713 lock_slot_offset = stack_slots; 1714 stack_slots += VMRegImpl::slots_per_word; 1715 } 1716 1717 // Now a place (+2) to save return values or temp during shuffling 1718 // + 4 for return address (which we own) and saved rbp 1719 stack_slots += 6; 1720 1721 // Ok The space we have allocated will look like: 1722 // 1723 // 1724 // FP-> | | 1725 // |---------------------| 1726 // | 2 slots for moves | 1727 // |---------------------| 1728 // | lock box (if sync) | 1729 // |---------------------| <- lock_slot_offset 1730 // | klass (if static) | 1731 // |---------------------| <- klass_slot_offset 1732 // | oopHandle area | 1733 // |---------------------| <- oop_handle_offset (6 java arg registers) 1734 // | outbound memory | 1735 // | based arguments | 1736 // | | 1737 // |---------------------| 1738 // | | 1739 // SP-> | out_preserved_slots | 1740 // 1741 // 1742 1743 1744 // Now compute actual number of stack words we need rounding to make 1745 // stack properly aligned. 1746 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1747 1748 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1749 1750 // First thing make an ic check to see if we should even be here 1751 1752 // We are free to use all registers as temps without saving them and 1753 // restoring them except rbp. rbp is the only callee save register 1754 // as far as the interpreter and the compiler(s) are concerned. 
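  // Inline cache check. By convention the compiled caller passes the expected
  // klass in rax (ic_reg) and the receiver in j_rarg0; we load the receiver's
  // actual klass into rscratch1 and compare. On a mismatch we tail-jump to the
  // shared ic-miss stub so the call site can be re-resolved.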
1755 1756 1757 const Register ic_reg = rax; 1758 const Register receiver = j_rarg0; 1759 1760 Label hit; 1761 Label exception_pending; 1762 1763 assert_different_registers(ic_reg, receiver, rscratch1); 1764 __ verify_oop(receiver); 1765 __ load_klass(rscratch1, receiver, rscratch2); 1766 __ cmpq(ic_reg, rscratch1); 1767 __ jcc(Assembler::equal, hit); 1768 1769 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1770 1771 // Verified entry point must be aligned 1772 __ align(8); 1773 1774 __ bind(hit); 1775 1776 int vep_offset = ((intptr_t)__ pc()) - start; 1777 1778 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1779 Label L_skip_barrier; 1780 Register klass = r10; 1781 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1782 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1783 1784 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1785 1786 __ bind(L_skip_barrier); 1787 } 1788 1789 #ifdef COMPILER1 1790 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1791 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1792 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1793 } 1794 #endif // COMPILER1 1795 1796 // The instruction at the verified entry point must be 5 bytes or longer 1797 // because it can be patched on the fly by make_non_entrant. The stack bang 1798 // instruction fits that requirement. 1799 1800 // Generate stack overflow check 1801 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1802 1803 // Generate a new frame for the wrapper. 1804 __ enter(); 1805 // -2 because return address is already present and so is saved rbp 1806 __ subptr(rsp, stack_size - 2*wordSize); 1807 1808 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1809 bs->nmethod_entry_barrier(masm); 1810 1811 // Frame is now completed as far as size and linkage. 1812 int frame_complete = ((intptr_t)__ pc()) - start; 1813 1814 if (UseRTMLocking) { 1815 // Abort RTM transaction before calling JNI 1816 // because critical section will be large and will be 1817 // aborted anyway. Also nmethod could be deoptimized. 1818 __ xabort(0); 1819 } 1820 1821 #ifdef ASSERT 1822 { 1823 Label L; 1824 __ mov(rax, rsp); 1825 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI) 1826 __ cmpptr(rax, rsp); 1827 __ jcc(Assembler::equal, L); 1828 __ stop("improperly aligned stack"); 1829 __ bind(L); 1830 } 1831 #endif /* ASSERT */ 1832 1833 1834 // We use r14 as the oop handle for the receiver/klass 1835 // It is callee save so it survives the call to native 1836 1837 const Register oop_handle_reg = r14; 1838 1839 // 1840 // We immediately shuffle the arguments so that any vm call we have to 1841 // make from here on out (sync slow path, jvmti, etc.) we will have 1842 // captured the oops from our caller and have a valid oopMap for 1843 // them. 1844 1845 // ----------------- 1846 // The Grand Shuffle 1847 1848 // The Java calling convention is either equal (linux) or denser (win64) than the 1849 // c calling convention. However the because of the jni_env argument the c calling 1850 // convention always has at least one more (and two for static) arguments than Java. 
1851 // Therefore if we move the args from java -> c backwards then we will never have 1852 // a register->register conflict and we don't have to build a dependency graph 1853 // and figure out how to break any cycles. 1854 // 1855 1856 // Record esp-based slot for receiver on stack for non-static methods 1857 int receiver_offset = -1; 1858 1859 // This is a trick. We double the stack slots so we can claim 1860 // the oops in the caller's frame. Since we are sure to have 1861 // more args than the caller doubling is enough to make 1862 // sure we can capture all the incoming oop args from the 1863 // caller. 1864 // 1865 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1866 1867 // Mark location of rbp (someday) 1868 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1869 1870 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1871 // All inbound args are referenced based on rbp and all outbound args via rsp. 1872 1873 1874 #ifdef ASSERT 1875 bool reg_destroyed[RegisterImpl::number_of_registers]; 1876 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1877 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1878 reg_destroyed[r] = false; 1879 } 1880 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1881 freg_destroyed[f] = false; 1882 } 1883 1884 #endif /* ASSERT */ 1885 1886 // This may iterate in two different directions depending on the 1887 // kind of native it is. The reason is that for regular JNI natives 1888 // the incoming and outgoing registers are offset upwards and for 1889 // critical natives they are offset down. 1890 GrowableArray<int> arg_order(2 * total_in_args); 1891 1892 VMRegPair tmp_vmreg; 1893 tmp_vmreg.set2(rbx->as_VMReg()); 1894 1895 if (!is_critical_native) { 1896 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1897 arg_order.push(i); 1898 arg_order.push(c_arg); 1899 } 1900 } else { 1901 // Compute a valid move order, using tmp_vmreg to break any cycles 1902 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); 1903 } 1904 1905 int temploc = -1; 1906 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1907 int i = arg_order.at(ai); 1908 int c_arg = arg_order.at(ai + 1); 1909 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1910 if (c_arg == -1) { 1911 assert(is_critical_native, "should only be required for critical natives"); 1912 // This arg needs to be moved to a temporary 1913 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); 1914 in_regs[i] = tmp_vmreg; 1915 temploc = i; 1916 continue; 1917 } else if (i == -1) { 1918 assert(is_critical_native, "should only be required for critical natives"); 1919 // Read from the temporary location 1920 assert(temploc != -1, "must be valid"); 1921 i = temploc; 1922 temploc = -1; 1923 } 1924 #ifdef ASSERT 1925 if (in_regs[i].first()->is_Register()) { 1926 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1927 } else if (in_regs[i].first()->is_XMMRegister()) { 1928 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1929 } 1930 if (out_regs[c_arg].first()->is_Register()) { 1931 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1932 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1933 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1934 } 1935 #endif /* 
ASSERT */ 1936 switch (in_sig_bt[i]) { 1937 case T_ARRAY: 1938 if (is_critical_native) { 1939 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]); 1940 c_arg++; 1941 #ifdef ASSERT 1942 if (out_regs[c_arg].first()->is_Register()) { 1943 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1944 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1945 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1946 } 1947 #endif 1948 break; 1949 } 1950 case T_OBJECT: 1951 assert(!is_critical_native, "no oop arguments"); 1952 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 1953 ((i == 0) && (!is_static)), 1954 &receiver_offset); 1955 break; 1956 case T_VOID: 1957 break; 1958 1959 case T_FLOAT: 1960 __ float_move(in_regs[i], out_regs[c_arg]); 1961 break; 1962 1963 case T_DOUBLE: 1964 assert( i + 1 < total_in_args && 1965 in_sig_bt[i + 1] == T_VOID && 1966 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 1967 __ double_move(in_regs[i], out_regs[c_arg]); 1968 break; 1969 1970 case T_LONG : 1971 __ long_move(in_regs[i], out_regs[c_arg]); 1972 break; 1973 1974 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 1975 1976 default: 1977 __ move32_64(in_regs[i], out_regs[c_arg]); 1978 } 1979 } 1980 1981 int c_arg; 1982 1983 // Pre-load a static method's oop into r14. Used both by locking code and 1984 // the normal JNI call code. 1985 if (!is_critical_native) { 1986 // point c_arg at the first arg that is already loaded in case we 1987 // need to spill before we call out 1988 c_arg = total_c_args - total_in_args; 1989 1990 if (method->is_static()) { 1991 1992 // load oop into a register 1993 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 1994 1995 // Now handlize the static class mirror it's known not-null. 1996 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 1997 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 1998 1999 // Now get the handle 2000 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2001 // store the klass handle as second argument 2002 __ movptr(c_rarg1, oop_handle_reg); 2003 // and protect the arg if we must spill 2004 c_arg--; 2005 } 2006 } else { 2007 // For JNI critical methods we need to save all registers in save_args. 2008 c_arg = 0; 2009 } 2010 2011 // Change state to native (we save the return address in the thread, since it might not 2012 // be pushed on the stack when we do a a stack traversal). It is enough that the pc() 2013 // points into the right code segment. It does not have to be the correct return pc. 2014 // We use the same pc/oopMap repeatedly when we call out 2015 2016 intptr_t the_pc = (intptr_t) __ pc(); 2017 oop_maps->add_gc_map(the_pc - start, map); 2018 2019 __ set_last_Java_frame(rsp, noreg, (address)the_pc); 2020 2021 2022 // We have all of the arguments setup at this point. We must not touch any register 2023 // argument registers at this point (what if we save/restore them there are no oop? 
2024 2025 { 2026 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2027 // protect the args we've loaded 2028 save_args(masm, total_c_args, c_arg, out_regs); 2029 __ mov_metadata(c_rarg1, method()); 2030 __ call_VM_leaf( 2031 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2032 r15_thread, c_rarg1); 2033 restore_args(masm, total_c_args, c_arg, out_regs); 2034 } 2035 2036 // RedefineClasses() tracing support for obsolete method entry 2037 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2038 // protect the args we've loaded 2039 save_args(masm, total_c_args, c_arg, out_regs); 2040 __ mov_metadata(c_rarg1, method()); 2041 __ call_VM_leaf( 2042 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2043 r15_thread, c_rarg1); 2044 restore_args(masm, total_c_args, c_arg, out_regs); 2045 } 2046 2047 // Lock a synchronized method 2048 2049 // Register definitions used by locking and unlocking 2050 2051 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2052 const Register obj_reg = rbx; // Will contain the oop 2053 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2054 const Register old_hdr = r13; // value of old header at unlock time 2055 2056 Label slow_path_lock; 2057 Label lock_done; 2058 2059 if (method->is_synchronized()) { 2060 assert(!is_critical_native, "unhandled"); 2061 2062 2063 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2064 2065 // Get the handle (the 2nd argument) 2066 __ mov(oop_handle_reg, c_rarg1); 2067 2068 // Get address of the box 2069 2070 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2071 2072 // Load the oop from the handle 2073 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2074 2075 if (LockingMode == LM_MONITOR) { 2076 __ jmp(slow_path_lock); 2077 } else if (LockingMode == LM_LEGACY) { 2078 if (UseBiasedLocking) { 2079 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock); 2080 } 2081 2082 // Load immediate 1 into swap_reg %rax 2083 __ movl(swap_reg, 1); 2084 2085 // Load (object->mark() | 1) into swap_reg %rax 2086 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2087 2088 // Save (object->mark() | 1) into BasicLock's displaced header 2089 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2090 2091 // src -> dest iff dest == rax else rax <- dest 2092 __ lock(); 2093 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2094 __ jcc(Assembler::equal, lock_done); 2095 2096 // Hmm should this move to the slow path code area??? 2097 2098 // Test if the oopMark is an obvious stack pointer, i.e., 2099 // 1) (mark & 3) == 0, and 2100 // 2) rsp <= mark < mark + os::pagesize() 2101 // These 3 tests can be done by evaluating the following 2102 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2103 // assuming both stack pointer and pagesize have their 2104 // least significant 2 bits clear. 
2105 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2106 2107 __ subptr(swap_reg, rsp); 2108 __ andptr(swap_reg, 3 - os::vm_page_size()); 2109 2110 // Save the test result, for recursive case, the result is zero 2111 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2112 __ jcc(Assembler::notEqual, slow_path_lock); 2113 } else { 2114 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2115 // Load object header 2116 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2117 __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2118 } 2119 2120 // Slow path will re-enter here 2121 2122 __ bind(lock_done); 2123 } 2124 2125 // Finally just about ready to make the JNI call 2126 2127 // get JNIEnv* which is first argument to native 2128 if (!is_critical_native) { 2129 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2130 2131 // Now set thread in native 2132 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2133 } 2134 2135 __ call(RuntimeAddress(native_func)); 2136 2137 // Verify or restore cpu control state after JNI call 2138 __ restore_cpu_control_state_after_jni(); 2139 2140 // Unpack native results. 2141 switch (ret_type) { 2142 case T_BOOLEAN: __ c2bool(rax); break; 2143 case T_CHAR : __ movzwl(rax, rax); break; 2144 case T_BYTE : __ sign_extend_byte (rax); break; 2145 case T_SHORT : __ sign_extend_short(rax); break; 2146 case T_INT : /* nothing to do */ break; 2147 case T_DOUBLE : 2148 case T_FLOAT : 2149 // Result is in xmm0 we'll save as needed 2150 break; 2151 case T_ARRAY: // Really a handle 2152 case T_OBJECT: // Really a handle 2153 break; // can't de-handlize until after safepoint check 2154 case T_VOID: break; 2155 case T_LONG: break; 2156 default : ShouldNotReachHere(); 2157 } 2158 2159 Label after_transition; 2160 2161 // If this is a critical native, check for a safepoint or suspend request after the call. 2162 // If a safepoint is needed, transition to native, then to native_trans to handle 2163 // safepoints like the native methods that are not critical natives. 2164 if (is_critical_native) { 2165 Label needs_safepoint; 2166 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */); 2167 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2168 __ jcc(Assembler::equal, after_transition); 2169 __ bind(needs_safepoint); 2170 } 2171 2172 // Switch thread to "native transition" state before reading the synchronization state. 2173 // This additional state is necessary because reading and testing the synchronization 2174 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2175 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2176 // VM thread changes sync state to synchronizing and suspends threads for GC. 2177 // Thread A is resumed to finish this native method, but doesn't block here since it 2178 // didn't see any synchronization is progress, and escapes. 
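  // In other words, the store of _thread_in_native_trans below has to be
  // visible to the VM thread before we read the safepoint/suspend state,
  // which is why it is followed by a full membar (the StoreLoad part is
  // typically a locked add or mfence on x86) before the safepoint poll.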
2179 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2180 2181 // Force this write out before the read below 2182 __ membar(Assembler::Membar_mask_bits( 2183 Assembler::LoadLoad | Assembler::LoadStore | 2184 Assembler::StoreLoad | Assembler::StoreStore)); 2185 2186 // check for safepoint operation in progress and/or pending suspend requests 2187 { 2188 Label Continue; 2189 Label slow_path; 2190 2191 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2192 2193 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2194 __ jcc(Assembler::equal, Continue); 2195 __ bind(slow_path); 2196 2197 // Don't use call_VM as it will see a possible pending exception and forward it 2198 // and never return here preventing us from clearing _last_native_pc down below. 2199 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2200 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2201 // by hand. 2202 // 2203 __ vzeroupper(); 2204 save_native_result(masm, ret_type, stack_slots); 2205 __ mov(c_rarg0, r15_thread); 2206 __ mov(r12, rsp); // remember sp 2207 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2208 __ andptr(rsp, -16); // align stack as required by ABI 2209 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2210 __ mov(rsp, r12); // restore sp 2211 __ reinit_heapbase(); 2212 // Restore any method result value 2213 restore_native_result(masm, ret_type, stack_slots); 2214 __ bind(Continue); 2215 } 2216 2217 // change thread state 2218 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2219 __ bind(after_transition); 2220 2221 Label reguard; 2222 Label reguard_done; 2223 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2224 __ jcc(Assembler::equal, reguard); 2225 __ bind(reguard_done); 2226 2227 // native result if any is live 2228 2229 // Unlock 2230 Label unlock_done; 2231 Label slow_path_unlock; 2232 if (method->is_synchronized()) { 2233 2234 // Get locked oop from the handle we passed to jni 2235 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2236 2237 Label done; 2238 2239 if (UseBiasedLocking) { 2240 __ biased_locking_exit(obj_reg, old_hdr, done); 2241 } 2242 2243 if (LockingMode == LM_LEGACY) { 2244 // Simple recursive lock? 
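      // Recursive locking note: in the LM_LEGACY fast path above, a recursive
      // stack lock stores zero (the result of the (mark - rsp) & ... test) as
      // the displaced header, so a lock box containing NULL_WORD below means
      // this frame already owned the monitor and no real unlock is needed.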
2245 2246 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2247 __ jcc(Assembler::equal, done); 2248 } 2249 2250 // Must save rax if if it is live now because cmpxchg must use it 2251 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2252 save_native_result(masm, ret_type, stack_slots); 2253 } 2254 2255 if (LockingMode == LM_MONITOR) { 2256 __ jmp(slow_path_unlock); 2257 } else if (LockingMode == LM_LEGACY) { 2258 // get address of the stack lock 2259 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2260 // get old displaced header 2261 __ movptr(old_hdr, Address(rax, 0)); 2262 2263 // Atomic swap old header if oop still contains the stack lock 2264 __ lock(); 2265 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2266 __ jcc(Assembler::notEqual, slow_path_unlock); 2267 } else { 2268 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2269 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2270 __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place); 2271 __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2272 } 2273 2274 // slow path re-enters here 2275 __ bind(unlock_done); 2276 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2277 restore_native_result(masm, ret_type, stack_slots); 2278 } 2279 2280 __ bind(done); 2281 2282 } 2283 { 2284 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2285 save_native_result(masm, ret_type, stack_slots); 2286 __ mov_metadata(c_rarg1, method()); 2287 __ call_VM_leaf( 2288 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2289 r15_thread, c_rarg1); 2290 restore_native_result(masm, ret_type, stack_slots); 2291 } 2292 2293 __ reset_last_Java_frame(false); 2294 2295 // Unbox oop result, e.g. JNIHandles::resolve value. 2296 if (is_reference_type(ret_type)) { 2297 __ resolve_jobject(rax /* value */, 2298 r15_thread /* thread */, 2299 rcx /* tmp */); 2300 } 2301 2302 if (CheckJNICalls) { 2303 // clear_pending_jni_exception_check 2304 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2305 } 2306 2307 if (!is_critical_native) { 2308 // reset handle block 2309 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2310 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2311 } 2312 2313 // pop our frame 2314 2315 __ leave(); 2316 2317 if (!is_critical_native) { 2318 // Any exception pending? 2319 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2320 __ jcc(Assembler::notEqual, exception_pending); 2321 } 2322 2323 // Return 2324 2325 __ ret(0); 2326 2327 // Unexpected paths are out of line and go here 2328 2329 if (!is_critical_native) { 2330 // forward the exception 2331 __ bind(exception_pending); 2332 2333 // and forward the exception 2334 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2335 } 2336 2337 // Slow path locking & unlocking 2338 if (method->is_synchronized()) { 2339 2340 // BEGIN Slow path lock 2341 __ bind(slow_path_lock); 2342 2343 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2344 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2345 2346 // protect the args we've loaded 2347 save_args(masm, total_c_args, c_arg, out_regs); 2348 2349 __ mov(c_rarg0, obj_reg); 2350 __ mov(c_rarg1, lock_reg); 2351 __ mov(c_rarg2, r15_thread); 2352 2353 // Not a leaf but we have last_Java_frame setup as we want 2354 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2355 restore_args(masm, total_c_args, c_arg, out_regs); 2356 2357 #ifdef ASSERT 2358 { Label L; 2359 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2360 __ jcc(Assembler::equal, L); 2361 __ stop("no pending exception allowed on exit from monitorenter"); 2362 __ bind(L); 2363 } 2364 #endif 2365 __ jmp(lock_done); 2366 2367 // END Slow path lock 2368 2369 // BEGIN Slow path unlock 2370 __ bind(slow_path_unlock); 2371 2372 // If we haven't already saved the native result we must save it now as xmm registers 2373 // are still exposed. 2374 __ vzeroupper(); 2375 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2376 save_native_result(masm, ret_type, stack_slots); 2377 } 2378 2379 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2380 2381 __ mov(c_rarg0, obj_reg); 2382 __ mov(c_rarg2, r15_thread); 2383 __ mov(r12, rsp); // remember sp 2384 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2385 __ andptr(rsp, -16); // align stack as required by ABI 2386 2387 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2388 // NOTE that obj_reg == rbx currently 2389 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2390 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2391 2392 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2393 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2394 __ mov(rsp, r12); // restore sp 2395 __ reinit_heapbase(); 2396 #ifdef ASSERT 2397 { 2398 Label L; 2399 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2400 __ jcc(Assembler::equal, L); 2401 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2402 __ bind(L); 2403 } 2404 #endif /* ASSERT */ 2405 2406 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2407 2408 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2409 restore_native_result(masm, ret_type, stack_slots); 2410 } 2411 __ jmp(unlock_done); 2412 2413 // END Slow path unlock 2414 2415 } // synchronized 2416 2417 // SLOW PATH Reguard the stack if needed 2418 2419 __ bind(reguard); 2420 __ vzeroupper(); 2421 save_native_result(masm, ret_type, stack_slots); 2422 __ mov(r12, rsp); // remember sp 2423 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2424 __ andptr(rsp, -16); // align stack as required by ABI 2425 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2426 __ mov(rsp, r12); // restore sp 2427 __ reinit_heapbase(); 2428 restore_native_result(masm, ret_type, stack_slots); 2429 // and continue 2430 __ jmp(reguard_done); 2431 2432 2433 2434 __ flush(); 2435 2436 nmethod *nm = nmethod::new_native_nmethod(method, 2437 compile_id, 2438 masm->code(), 2439 vep_offset, 2440 frame_complete, 2441 stack_slots / VMRegImpl::slots_per_word, 2442 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2443 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2444 oop_maps); 2445 2446 return nm; 2447 } 2448 2449 // this function returns the adjust size (in number of words) to a c2i adapter 2450 // activation for use during deoptimization 2451 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2452 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2453 } 2454 2455 2456 uint SharedRuntime::out_preserve_stack_slots() { 2457 return 0; 2458 } 2459 2460 2461 // Number of stack slots between incoming argument block and the start of 2462 // a new frame. The PROLOG must add this many slots to the stack. The 2463 // EPILOG must remove this many slots. amd64 needs two slots for 2464 // return address. 2465 uint SharedRuntime::in_preserve_stack_slots() { 2466 return 4 + 2 * VerifyStackAtCalls; 2467 } 2468 2469 //------------------------------generate_deopt_blob---------------------------- 2470 void SharedRuntime::generate_deopt_blob() { 2471 // Allocate space for the code 2472 ResourceMark rm; 2473 // Setup code generation tools 2474 int pad = 0; 2475 if (UseAVX > 2) { 2476 pad += 1024; 2477 } 2478 #if INCLUDE_JVMCI 2479 if (EnableJVMCI) { 2480 pad += 512; // Increase the buffer size when compiling for JVMCI 2481 } 2482 #endif 2483 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2484 MacroAssembler* masm = new MacroAssembler(&buffer); 2485 int frame_size_in_words; 2486 OopMap* map = NULL; 2487 OopMapSet *oop_maps = new OopMapSet(); 2488 2489 // ------------- 2490 // This code enters when returning to a de-optimized nmethod. A return 2491 // address has been pushed on the the stack, and return values are in 2492 // registers. 2493 // If we are doing a normal deopt then we were called from the patched 2494 // nmethod from the point we returned to the nmethod. So the return 2495 // address on the stack is wrong by NativeCall::instruction_size 2496 // We will adjust the value so it looks like we have the original return 2497 // address on the stack (like when we eagerly deoptimized). 2498 // In the case of an exception pending when deoptimizing, we enter 2499 // with a return address on the stack that points after the call we patched 2500 // into the exception handler. We have the following register state from, 2501 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2502 // rax: exception oop 2503 // rbx: exception handler 2504 // rdx: throwing pc 2505 // So in this case we simply jam rdx into the useless return address and 2506 // the stack looks just like we want. 2507 // 2508 // At this point we need to de-opt. We save the argument return 2509 // registers. We call the first C routine, fetch_unroll_info(). This 2510 // routine captures the return values and returns a structure which 2511 // describes the current frame size and the sizes of all replacement frames. 2512 // The current frame is compiled code and may contain many inlined 2513 // functions, each with their own JVM state. We pop the current frame, then 2514 // push all the new frames. Then we call the C routine unpack_frames() to 2515 // populate these frames. Finally unpack_frames() returns us the new target 2516 // address. Notice that callee-save registers are BLOWN here; they have 2517 // already been captured in the vframeArray at the time the return PC was 2518 // patched. 2519 address start = __ pc(); 2520 Label cont; 2521 2522 // Prolog for non exception case! 
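  // Entry point summary: the normal (Unpack_deopt), reexecute
  // (Unpack_reexecute) and exception entries below all converge on the common
  // code at 'cont'; the exception entry receives the exception oop and
  // throwing pc in rax/rdx and stores them into the thread before falling
  // through to exception_in_tls_offset. When JVMCI is enabled, the
  // uncommon-trap entries call Deoptimization::uncommon_trap directly and
  // rejoin at after_fetch_unroll_info_call.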
2523 2524 // Save everything in sight. 2525 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2526 2527 // Normal deoptimization. Save exec mode for unpack_frames. 2528 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2529 __ jmp(cont); 2530 2531 int reexecute_offset = __ pc() - start; 2532 #if INCLUDE_JVMCI && !defined(COMPILER1) 2533 if (EnableJVMCI && UseJVMCICompiler) { 2534 // JVMCI does not use this kind of deoptimization 2535 __ should_not_reach_here(); 2536 } 2537 #endif 2538 2539 // Reexecute case 2540 // return address is the pc describes what bci to do re-execute at 2541 2542 // No need to update map as each call to save_live_registers will produce identical oopmap 2543 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2544 2545 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2546 __ jmp(cont); 2547 2548 #if INCLUDE_JVMCI 2549 Label after_fetch_unroll_info_call; 2550 int implicit_exception_uncommon_trap_offset = 0; 2551 int uncommon_trap_offset = 0; 2552 2553 if (EnableJVMCI) { 2554 implicit_exception_uncommon_trap_offset = __ pc() - start; 2555 2556 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2557 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD); 2558 2559 uncommon_trap_offset = __ pc() - start; 2560 2561 // Save everything in sight. 2562 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2563 // fetch_unroll_info needs to call last_java_frame() 2564 __ set_last_Java_frame(noreg, noreg, NULL); 2565 2566 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2567 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2568 2569 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute); 2570 __ mov(c_rarg0, r15_thread); 2571 __ movl(c_rarg2, r14); // exec mode 2572 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2573 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2574 2575 __ reset_last_Java_frame(false); 2576 2577 __ jmp(after_fetch_unroll_info_call); 2578 } // EnableJVMCI 2579 #endif // INCLUDE_JVMCI 2580 2581 int exception_offset = __ pc() - start; 2582 2583 // Prolog for exception case 2584 2585 // all registers are dead at this entry point, except for rax, and 2586 // rdx which contain the exception oop and exception pc 2587 // respectively. Set them in TLS and fall thru to the 2588 // unpack_with_exception_in_tls entry point. 2589 2590 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2591 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2592 2593 int exception_in_tls_offset = __ pc() - start; 2594 2595 // new implementation because exception oop is now passed in JavaThread 2596 2597 // Prolog for exception case 2598 // All registers must be preserved because they might be used by LinearScan 2599 // Exceptiop oop and throwing PC are passed in JavaThread 2600 // tos: stack at point of call to method that threw the exception (i.e. only 2601 // args are on the stack, no return address) 2602 2603 // make room on stack for the return address 2604 // It will be patched later with the throwing pc. The correct value is not 2605 // available now because loading it from memory would destroy registers. 
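  // Illustratively: push(0) just reserves the return-address slot; once the
  // registers have been saved, the throwing pc is read from
  // JavaThread::exception_pc and written into that slot (at rbp + wordSize),
  // so the frame then looks as if the throwing method had called us.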
2606 __ push(0); 2607 2608 // Save everything in sight. 2609 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2610 2611 // Now it is safe to overwrite any register 2612 2613 // Deopt during an exception. Save exec mode for unpack_frames. 2614 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2615 2616 // load throwing pc from JavaThread and patch it as the return address 2617 // of the current frame. Then clear the field in JavaThread 2618 2619 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2620 __ movptr(Address(rbp, wordSize), rdx); 2621 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2622 2623 #ifdef ASSERT 2624 // verify that there is really an exception oop in JavaThread 2625 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2626 __ verify_oop(rax); 2627 2628 // verify that there is no pending exception 2629 Label no_pending_exception; 2630 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2631 __ testptr(rax, rax); 2632 __ jcc(Assembler::zero, no_pending_exception); 2633 __ stop("must not have pending exception here"); 2634 __ bind(no_pending_exception); 2635 #endif 2636 2637 __ bind(cont); 2638 2639 // Call C code. Need thread and this frame, but NOT official VM entry 2640 // crud. We cannot block on this call, no GC can happen. 2641 // 2642 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2643 2644 // fetch_unroll_info needs to call last_java_frame(). 2645 2646 __ set_last_Java_frame(noreg, noreg, NULL); 2647 #ifdef ASSERT 2648 { Label L; 2649 __ cmpptr(Address(r15_thread, 2650 JavaThread::last_Java_fp_offset()), 2651 (int32_t)0); 2652 __ jcc(Assembler::equal, L); 2653 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2654 __ bind(L); 2655 } 2656 #endif // ASSERT 2657 __ mov(c_rarg0, r15_thread); 2658 __ movl(c_rarg1, r14); // exec_mode 2659 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2660 2661 // Need to have an oopmap that tells fetch_unroll_info where to 2662 // find any register it might need. 2663 oop_maps->add_gc_map(__ pc() - start, map); 2664 2665 __ reset_last_Java_frame(false); 2666 2667 #if INCLUDE_JVMCI 2668 if (EnableJVMCI) { 2669 __ bind(after_fetch_unroll_info_call); 2670 } 2671 #endif 2672 2673 // Load UnrollBlock* into rdi 2674 __ mov(rdi, rax); 2675 2676 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2677 Label noException; 2678 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2679 __ jcc(Assembler::notEqual, noException); 2680 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2681 // QQQ this is useless it was NULL above 2682 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2683 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2684 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2685 2686 __ verify_oop(rax); 2687 2688 // Overwrite the result registers with the exception results. 2689 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2690 // I think this is useless 2691 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2692 2693 __ bind(noException); 2694 2695 // Only register save data is on the stack. 2696 // Now restore the result registers. 
// Everything else is either dead
// or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
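  // Sketch of the loop below for a hypothetical two-frame case, with
  // frame_sizes = {size0, size1} and frame_pcs = {pc0, pc1, pc_final}:
  //   push pc0; enter; sub rsp, size0 - 2*wordSize   // first skeletal frame
  //   push pc1; enter; sub rsp, size1 - 2*wordSize   // second skeletal frame
  //   push pc_final                                  // final return address
  // Each skeletal frame also records the previous sender_sp and a NULL
  // last_sp, which layout_activation_impl() corrects later.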
2748 2749 const Register sender_sp = r8; 2750 2751 __ mov(sender_sp, rsp); 2752 __ movl(rbx, Address(rdi, 2753 Deoptimization::UnrollBlock:: 2754 caller_adjustment_offset_in_bytes())); 2755 __ subptr(rsp, rbx); 2756 2757 // Push interpreter frames in a loop 2758 Label loop; 2759 __ bind(loop); 2760 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2761 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2762 __ pushptr(Address(rcx, 0)); // Save return address 2763 __ enter(); // Save old & set new ebp 2764 __ subptr(rsp, rbx); // Prolog 2765 // This value is corrected by layout_activation_impl 2766 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2767 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2768 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2769 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2770 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2771 __ decrementl(rdx); // Decrement counter 2772 __ jcc(Assembler::notZero, loop); 2773 __ pushptr(Address(rcx, 0)); // Save final return address 2774 2775 // Re-push self-frame 2776 __ enter(); // Save old & set new ebp 2777 2778 // Allocate a full sized register save area. 2779 // Return address and rbp are in place, so we allocate two less words. 2780 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2781 2782 // Restore frame locals after moving the frame 2783 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2784 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2785 2786 // Call C code. Need thread but NOT official VM entry 2787 // crud. We cannot block on this call, no GC can happen. Call should 2788 // restore return values to their stack-slots with the new SP. 2789 // 2790 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2791 2792 // Use rbp because the frames look interpreted now 2793 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2794 // Don't need the precise return PC here, just precise enough to point into this code blob. 2795 address the_pc = __ pc(); 2796 __ set_last_Java_frame(noreg, rbp, the_pc); 2797 2798 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2799 __ mov(c_rarg0, r15_thread); 2800 __ movl(c_rarg1, r14); // second arg: exec_mode 2801 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2802 // Revert SP alignment after call since we're going to do some SP relative addressing below 2803 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2804 2805 // Set an oopmap for the call site 2806 // Use the same PC we used for the last java frame 2807 oop_maps->add_gc_map(the_pc - start, 2808 new OopMap( frame_size_in_words, 0 )); 2809 2810 // Clear fp AND pc 2811 __ reset_last_Java_frame(true); 2812 2813 // Collect return values 2814 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2815 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2816 // I think this is useless (throwing pc?) 2817 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2818 2819 // Pop self-frame. 
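  // leave() undoes the enter() above (mov rsp, rbp; pop rbp), and the ret
  // below then continues at the final return address pushed after the
  // frame-building loop, i.e. in the interpreter.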
2820 __ leave(); // Epilog 2821 2822 // Jump to interpreter 2823 __ ret(0); 2824 2825 // Make sure all code is generated 2826 masm->flush(); 2827 2828 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2829 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2830 #if INCLUDE_JVMCI 2831 if (EnableJVMCI) { 2832 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2833 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2834 } 2835 #endif 2836 } 2837 2838 #ifdef COMPILER2 2839 //------------------------------generate_uncommon_trap_blob-------------------- 2840 void SharedRuntime::generate_uncommon_trap_blob() { 2841 // Allocate space for the code 2842 ResourceMark rm; 2843 // Setup code generation tools 2844 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2845 MacroAssembler* masm = new MacroAssembler(&buffer); 2846 2847 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2848 2849 address start = __ pc(); 2850 2851 if (UseRTMLocking) { 2852 // Abort RTM transaction before possible nmethod deoptimization. 2853 __ xabort(0); 2854 } 2855 2856 // Push self-frame. We get here with a return address on the 2857 // stack, so rsp is 8-byte aligned until we allocate our frame. 2858 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2859 2860 // No callee saved registers. rbp is assumed implicitly saved 2861 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2862 2863 // compiler left unloaded_class_index in j_rarg0 move to where the 2864 // runtime expects it. 2865 __ movl(c_rarg1, j_rarg0); 2866 2867 __ set_last_Java_frame(noreg, noreg, NULL); 2868 2869 // Call C code. Need thread but NOT official VM entry 2870 // crud. We cannot block on this call, no GC can happen. Call should 2871 // capture callee-saved registers as well as return values. 2872 // Thread is in rdi already. 2873 // 2874 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2875 2876 __ mov(c_rarg0, r15_thread); 2877 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2878 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2879 2880 // Set an oopmap for the call site 2881 OopMapSet* oop_maps = new OopMapSet(); 2882 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2883 2884 // location of rbp is known implicitly by the frame sender code 2885 2886 oop_maps->add_gc_map(__ pc() - start, map); 2887 2888 __ reset_last_Java_frame(false); 2889 2890 // Load UnrollBlock* into rdi 2891 __ mov(rdi, rax); 2892 2893 #ifdef ASSERT 2894 { Label L; 2895 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2896 (int32_t)Deoptimization::Unpack_uncommon_trap); 2897 __ jcc(Assembler::equal, L); 2898 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2899 __ bind(L); 2900 } 2901 #endif 2902 2903 // Pop all the frames we must move/replace. 2904 // 2905 // Frame picture (youngest to oldest) 2906 // 1: self-frame (no frame link) 2907 // 2: deopting frame (no frame link) 2908 // 3: caller of deopting frame (could be compiled/interpreted). 2909 2910 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2911 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
2912 2913 // Pop deoptimized frame (int) 2914 __ movl(rcx, Address(rdi, 2915 Deoptimization::UnrollBlock:: 2916 size_of_deoptimized_frame_offset_in_bytes())); 2917 __ addptr(rsp, rcx); 2918 2919 // rsp should be pointing at the return address to the caller (3) 2920 2921 // Pick up the initial fp we should save 2922 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2923 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2924 2925 #ifdef ASSERT 2926 // Compilers generate code that bang the stack by as much as the 2927 // interpreter would need. So this stack banging should never 2928 // trigger a fault. Verify that it does not on non product builds. 2929 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2930 __ bang_stack_size(rbx, rcx); 2931 #endif 2932 2933 // Load address of array of frame pcs into rcx (address*) 2934 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2935 2936 // Trash the return pc 2937 __ addptr(rsp, wordSize); 2938 2939 // Load address of array of frame sizes into rsi (intptr_t*) 2940 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2941 2942 // Counter 2943 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2944 2945 // Now adjust the caller's stack to make up for the extra locals but 2946 // record the original sp so that we can save it in the skeletal 2947 // interpreter frame and the stack walking of interpreter_sender 2948 // will get the unextended sp value and not the "real" sp value. 2949 2950 const Register sender_sp = r8; 2951 2952 __ mov(sender_sp, rsp); 2953 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2954 __ subptr(rsp, rbx); 2955 2956 // Push interpreter frames in a loop 2957 Label loop; 2958 __ bind(loop); 2959 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2960 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2961 __ pushptr(Address(rcx, 0)); // Save return address 2962 __ enter(); // Save old & set new rbp 2963 __ subptr(rsp, rbx); // Prolog 2964 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2965 sender_sp); // Make it walkable 2966 // This value is corrected by layout_activation_impl 2967 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2968 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2969 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2970 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2971 __ decrementl(rdx); // Decrement counter 2972 __ jcc(Assembler::notZero, loop); 2973 __ pushptr(Address(rcx, 0)); // Save final return address 2974 2975 // Re-push self-frame 2976 __ enter(); // Save old & set new rbp 2977 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2978 // Prolog 2979 2980 // Use rbp because the frames look interpreted now 2981 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2982 // Don't need the precise return PC here, just precise enough to point into this code blob. 2983 address the_pc = __ pc(); 2984 __ set_last_Java_frame(noreg, rbp, the_pc); 2985 2986 // Call C code. Need thread but NOT official VM entry 2987 // crud. We cannot block on this call, no GC can happen. 
Call should
2988 // restore return values to their stack-slots with the new SP.
2989 // Thread is in rdi already.
2990 //
2991 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2992
2993 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2994 __ mov(c_rarg0, r15_thread);
2995 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2996 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2997
2998 // Set an oopmap for the call site
2999 // Use the same PC we used for the last java frame
3000 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3001
3002 // Clear fp AND pc
3003 __ reset_last_Java_frame(true);
3004
3005 // Pop self-frame.
3006 __ leave(); // Epilog
3007
3008 // Jump to interpreter
3009 __ ret(0);
3010
3011 // Make sure all code is generated
3012 masm->flush();
3013
3014 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3015 SimpleRuntimeFrame::framesize >> 1);
3016 }
3017 #endif // COMPILER2
3018
3019 //------------------------------generate_handler_blob------
3020 //
3021 // Generate a special Compile2Runtime blob that saves all registers,
3022 // and sets up an oopmap.
3023 //
3024 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3025 assert(StubRoutines::forward_exception_entry() != NULL,
3026 "must be generated before");
3027
3028 ResourceMark rm;
3029 OopMapSet *oop_maps = new OopMapSet();
3030 OopMap* map;
3031
3032 // Allocate space for the code. Setup code generation tools.
3033 CodeBuffer buffer("handler_blob", 2048, 1024);
3034 MacroAssembler* masm = new MacroAssembler(&buffer);
3035
3036 address start = __ pc();
3037 address call_pc = NULL;
3038 int frame_size_in_words;
3039 bool cause_return = (poll_type == POLL_AT_RETURN);
3040 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3041
3042 if (UseRTMLocking) {
3043 // Abort RTM transaction before calling runtime
3044 // because critical section will be large and will be
3045 // aborted anyway. Also nmethod could be deoptimized.
3046 __ xabort(0);
3047 }
3048
3049 // Make room for return address (or push it again)
3050 if (!cause_return) {
3051 __ push(rbx);
3052 }
3053
3054 // Save registers, fpu state, and flags
3055 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3056
3057 // The following is basically a call_VM. However, we need the precise
3058 // address of the call in order to generate an oopmap. Hence, we do all the
3059 // work ourselves.
3060
3061 __ set_last_Java_frame(noreg, noreg, NULL);
3062
3063 // The return address must always be correct so that the frame constructor
3064 // never sees an invalid pc.
3065
3066 if (!cause_return) {
3067 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3068 // Additionally, rbx is a callee saved register and we can look at it later to determine
3069 // if someone changed the return address for us!
3070 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3071 __ movptr(Address(rbp, wordSize), rbx);
3072 }
3073
3074 // Do the call
3075 __ mov(c_rarg0, r15_thread);
3076 __ call(RuntimeAddress(call_ptr));
3077
3078 // Set an oopmap for the call site. This oopmap will map all
3079 // oop-registers and debug-info registers as callee-saved. This
3080 // will allow deoptimization at this safepoint to find all possible
3081 // debug-info recordings, as well as let GC find all oops.
3082 3083 oop_maps->add_gc_map( __ pc() - start, map); 3084 3085 Label noException; 3086 3087 __ reset_last_Java_frame(false); 3088 3089 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 3090 __ jcc(Assembler::equal, noException); 3091 3092 // Exception pending 3093 3094 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3095 3096 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3097 3098 // No exception case 3099 __ bind(noException); 3100 3101 Label no_adjust; 3102 #ifdef ASSERT 3103 Label bail; 3104 #endif 3105 if (!cause_return) { 3106 Label no_prefix, not_special; 3107 3108 // If our stashed return pc was modified by the runtime we avoid touching it 3109 __ cmpptr(rbx, Address(rbp, wordSize)); 3110 __ jccb(Assembler::notEqual, no_adjust); 3111 3112 // Skip over the poll instruction. 3113 // See NativeInstruction::is_safepoint_poll() 3114 // Possible encodings: 3115 // 85 00 test %eax,(%rax) 3116 // 85 01 test %eax,(%rcx) 3117 // 85 02 test %eax,(%rdx) 3118 // 85 03 test %eax,(%rbx) 3119 // 85 06 test %eax,(%rsi) 3120 // 85 07 test %eax,(%rdi) 3121 // 3122 // 41 85 00 test %eax,(%r8) 3123 // 41 85 01 test %eax,(%r9) 3124 // 41 85 02 test %eax,(%r10) 3125 // 41 85 03 test %eax,(%r11) 3126 // 41 85 06 test %eax,(%r14) 3127 // 41 85 07 test %eax,(%r15) 3128 // 3129 // 85 04 24 test %eax,(%rsp) 3130 // 41 85 04 24 test %eax,(%r12) 3131 // 85 45 00 test %eax,0x0(%rbp) 3132 // 41 85 45 00 test %eax,0x0(%r13) 3133 3134 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3135 __ jcc(Assembler::notEqual, no_prefix); 3136 __ addptr(rbx, 1); 3137 __ bind(no_prefix); 3138 #ifdef ASSERT 3139 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3140 #endif 3141 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3142 // r12/rsp 0x04 3143 // r13/rbp 0x05 3144 __ movzbq(rcx, Address(rbx, 1)); 3145 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3146 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3147 __ cmpptr(rcx, 1); 3148 __ jcc(Assembler::above, not_special); 3149 __ addptr(rbx, 1); 3150 __ bind(not_special); 3151 #ifdef ASSERT 3152 // Verify the correct encoding of the poll we're about to skip. 3153 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3154 __ jcc(Assembler::notEqual, bail); 3155 // Mask out the modrm bits 3156 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3157 // rax encodes to 0, so if the bits are nonzero it's incorrect 3158 __ jcc(Assembler::notZero, bail); 3159 #endif 3160 // Adjust return pc forward to step over the safepoint poll instruction 3161 __ addptr(rbx, 2); 3162 __ movptr(Address(rbp, wordSize), rbx); 3163 } 3164 3165 __ bind(no_adjust); 3166 // Normal exit, restore registers and exit. 3167 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3168 __ ret(0); 3169 3170 #ifdef ASSERT 3171 __ bind(bail); 3172 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3173 #endif 3174 3175 // Make sure all code is generated 3176 masm->flush(); 3177 3178 // Fill-out other meta info 3179 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3180 } 3181 3182 // 3183 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3184 // 3185 // Generate a stub that calls into vm to find out the proper destination 3186 // of a java call. 
All the argument registers are live at this point
3187 // but since this is generic code we don't know what they are and the caller
3188 // must do any gc of the args.
3189 //
3190 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3191 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3192
3193 // allocate space for the code
3194 ResourceMark rm;
3195
3196 CodeBuffer buffer(name, 1000, 512);
3197 MacroAssembler* masm = new MacroAssembler(&buffer);
3198
3199 int frame_size_in_words;
3200
3201 OopMapSet *oop_maps = new OopMapSet();
3202 OopMap* map = NULL;
3203
3204 int start = __ offset();
3205
3206 // No need to save vector registers since they are caller-saved anyway.
3207 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3208
3209 int frame_complete = __ offset();
3210
3211 __ set_last_Java_frame(noreg, noreg, NULL);
3212
3213 __ mov(c_rarg0, r15_thread);
3214
3215 __ call(RuntimeAddress(destination));
3216
3217
3218 // Set an oopmap for the call site.
3219 // We need this not only for callee-saved registers, but also for volatile
3220 // registers that the compiler might be keeping live across a safepoint.
3221
3222 oop_maps->add_gc_map( __ offset() - start, map);
3223
3224 // rax contains the address we are going to jump to assuming no exception got installed
3225
3226 // clear last_Java_sp
3227 __ reset_last_Java_frame(false);
3228 // check for pending exceptions
3229 Label pending;
3230 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3231 __ jcc(Assembler::notEqual, pending);
3232
3233 // get the returned Method*
3234 __ get_vm_result_2(rbx, r15_thread);
3235 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3236
3237 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3238
3239 RegisterSaver::restore_live_registers(masm);
3240
3241 // We are back to the original state on entry and ready to go.
3242
3243 __ jmp(rax);
3244
3245 // Pending exception after the safepoint
3246
3247 __ bind(pending);
3248
3249 RegisterSaver::restore_live_registers(masm);
3250
3251 // exception pending => remove activation and forward to exception handler
3252
3253 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3254
3255 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3256 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3257
3258 // -------------
3259 // make sure all code is generated
3260 masm->flush();
3261
3262 // return the blob
3263 // frame size is in words (frame_size_in_words, as returned by the register saver)
3264 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3265 } 3266 3267 #ifdef COMPILER2 3268 static const int native_invoker_code_size = MethodHandles::adapter_code_size; 3269 3270 class NativeInvokerGenerator : public StubCodeGenerator { 3271 address _call_target; 3272 int _shadow_space_bytes; 3273 3274 const GrowableArray<VMReg>& _input_registers; 3275 const GrowableArray<VMReg>& _output_registers; 3276 3277 int _frame_complete; 3278 int _framesize; 3279 OopMapSet* _oop_maps; 3280 public: 3281 NativeInvokerGenerator(CodeBuffer* buffer, 3282 address call_target, 3283 int shadow_space_bytes, 3284 const GrowableArray<VMReg>& input_registers, 3285 const GrowableArray<VMReg>& output_registers) 3286 : StubCodeGenerator(buffer, PrintMethodHandleStubs), 3287 _call_target(call_target), 3288 _shadow_space_bytes(shadow_space_bytes), 3289 _input_registers(input_registers), 3290 _output_registers(output_registers), 3291 _frame_complete(0), 3292 _framesize(0), 3293 _oop_maps(NULL) { 3294 assert(_output_registers.length() <= 1 3295 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns"); 3296 3297 } 3298 3299 void generate(); 3300 3301 int spill_size_in_bytes() const { 3302 if (_output_registers.length() == 0) { 3303 return 0; 3304 } 3305 VMReg reg = _output_registers.at(0); 3306 assert(reg->is_reg(), "must be a register"); 3307 if (reg->is_Register()) { 3308 return 8; 3309 } else if (reg->is_XMMRegister()) { 3310 if (UseAVX >= 3) { 3311 return 64; 3312 } else if (UseAVX >= 1) { 3313 return 32; 3314 } else { 3315 return 16; 3316 } 3317 } else { 3318 ShouldNotReachHere(); 3319 } 3320 return 0; 3321 } 3322 3323 void spill_out_registers() { 3324 if (_output_registers.length() == 0) { 3325 return; 3326 } 3327 VMReg reg = _output_registers.at(0); 3328 assert(reg->is_reg(), "must be a register"); 3329 MacroAssembler* masm = _masm; 3330 if (reg->is_Register()) { 3331 __ movptr(Address(rsp, 0), reg->as_Register()); 3332 } else if (reg->is_XMMRegister()) { 3333 if (UseAVX >= 3) { 3334 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit); 3335 } else if (UseAVX >= 1) { 3336 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister()); 3337 } else { 3338 __ movdqu(Address(rsp, 0), reg->as_XMMRegister()); 3339 } 3340 } else { 3341 ShouldNotReachHere(); 3342 } 3343 } 3344 3345 void fill_out_registers() { 3346 if (_output_registers.length() == 0) { 3347 return; 3348 } 3349 VMReg reg = _output_registers.at(0); 3350 assert(reg->is_reg(), "must be a register"); 3351 MacroAssembler* masm = _masm; 3352 if (reg->is_Register()) { 3353 __ movptr(reg->as_Register(), Address(rsp, 0)); 3354 } else if (reg->is_XMMRegister()) { 3355 if (UseAVX >= 3) { 3356 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit); 3357 } else if (UseAVX >= 1) { 3358 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3359 } else { 3360 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0)); 3361 } 3362 } else { 3363 ShouldNotReachHere(); 3364 } 3365 } 3366 3367 int frame_complete() const { 3368 return _frame_complete; 3369 } 3370 3371 int framesize() const { 3372 return (_framesize >> (LogBytesPerWord - LogBytesPerInt)); 3373 } 3374 3375 OopMapSet* oop_maps() const { 3376 return _oop_maps; 3377 } 3378 3379 private: 3380 #ifdef ASSERT 3381 bool target_uses_register(VMReg reg) { 3382 return _input_registers.contains(reg) || _output_registers.contains(reg); 3383 } 3384 #endif 3385 }; 3386 3387 RuntimeStub* 
SharedRuntime::make_native_invoker(address call_target, 3388 int shadow_space_bytes, 3389 const GrowableArray<VMReg>& input_registers, 3390 const GrowableArray<VMReg>& output_registers) { 3391 int locs_size = 64; 3392 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size); 3393 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers); 3394 g.generate(); 3395 code.log_section_sizes("nep_invoker_blob"); 3396 3397 RuntimeStub* stub = 3398 RuntimeStub::new_runtime_stub("nep_invoker_blob", 3399 &code, 3400 g.frame_complete(), 3401 g.framesize(), 3402 g.oop_maps(), false); 3403 return stub; 3404 } 3405 3406 void NativeInvokerGenerator::generate() { 3407 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict"); 3408 3409 enum layout { 3410 rbp_off, 3411 rbp_off2, 3412 return_off, 3413 return_off2, 3414 framesize // inclusive of return address 3415 }; 3416 3417 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4); 3418 assert(is_even(_framesize/2), "sp not 16-byte aligned"); 3419 3420 _oop_maps = new OopMapSet(); 3421 MacroAssembler* masm = _masm; 3422 3423 address start = __ pc(); 3424 3425 __ enter(); 3426 3427 // return address and rbp are already in place 3428 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog 3429 3430 _frame_complete = __ pc() - start; 3431 3432 address the_pc = __ pc(); 3433 3434 __ set_last_Java_frame(rsp, rbp, (address)the_pc); 3435 OopMap* map = new OopMap(_framesize, 0); 3436 _oop_maps->add_gc_map(the_pc - start, map); 3437 3438 // State transition 3439 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 3440 3441 __ call(RuntimeAddress(_call_target)); 3442 3443 __ restore_cpu_control_state_after_jni(); 3444 3445 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 3446 3447 // Force this write out before the read below 3448 __ membar(Assembler::Membar_mask_bits( 3449 Assembler::LoadLoad | Assembler::LoadStore | 3450 Assembler::StoreLoad | Assembler::StoreStore)); 3451 3452 Label L_after_safepoint_poll; 3453 Label L_safepoint_poll_slow_path; 3454 3455 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 3456 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 3457 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path); 3458 3459 __ bind(L_after_safepoint_poll); 3460 3461 // change thread state 3462 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 3463 3464 __ block_comment("reguard stack check"); 3465 Label L_reguard; 3466 Label L_after_reguard; 3467 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 3468 __ jcc(Assembler::equal, L_reguard); 3469 __ bind(L_after_reguard); 3470 3471 __ reset_last_Java_frame(r15_thread, true); 3472 3473 __ leave(); // required for proper stackwalking of RuntimeStub frame 3474 __ ret(0); 3475 3476 ////////////////////////////////////////////////////////////////////////////// 3477 3478 __ block_comment("{ L_safepoint_poll_slow_path"); 3479 __ bind(L_safepoint_poll_slow_path); 3480 __ vzeroupper(); 3481 3482 spill_out_registers(); 3483 3484 __ mov(c_rarg0, r15_thread); 3485 __ mov(r12, rsp); // remember sp 3486 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3487 __ andptr(rsp, -16); // align stack as 
required by ABI 3488 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 3489 __ mov(rsp, r12); // restore sp 3490 __ reinit_heapbase(); 3491 3492 fill_out_registers(); 3493 3494 __ jmp(L_after_safepoint_poll); 3495 __ block_comment("} L_safepoint_poll_slow_path"); 3496 3497 ////////////////////////////////////////////////////////////////////////////// 3498 3499 __ block_comment("{ L_reguard"); 3500 __ bind(L_reguard); 3501 __ vzeroupper(); 3502 3503 spill_out_registers(); 3504 3505 __ mov(r12, rsp); // remember sp 3506 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 3507 __ andptr(rsp, -16); // align stack as required by ABI 3508 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 3509 __ mov(rsp, r12); // restore sp 3510 __ reinit_heapbase(); 3511 3512 fill_out_registers(); 3513 3514 __ jmp(L_after_reguard); 3515 3516 __ block_comment("} L_reguard"); 3517 3518 ////////////////////////////////////////////////////////////////////////////// 3519 3520 __ flush(); 3521 } 3522 #endif // COMPILER2 3523 3524 //------------------------------Montgomery multiplication------------------------ 3525 // 3526 3527 #ifndef _WINDOWS 3528 3529 // Subtract 0:b from carry:a. Return carry. 3530 static julong 3531 sub(julong a[], julong b[], julong carry, long len) { 3532 long long i = 0, cnt = len; 3533 julong tmp; 3534 asm volatile("clc; " 3535 "0: ; " 3536 "mov (%[b], %[i], 8), %[tmp]; " 3537 "sbb %[tmp], (%[a], %[i], 8); " 3538 "inc %[i]; dec %[cnt]; " 3539 "jne 0b; " 3540 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3541 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3542 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3543 : "memory"); 3544 return tmp; 3545 } 3546 3547 // Multiply (unsigned) Long A by Long B, accumulating the double- 3548 // length result into the accumulator formed of T0, T1, and T2. 3549 #define MACC(A, B, T0, T1, T2) \ 3550 do { \ 3551 unsigned long hi, lo; \ 3552 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3553 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3554 : "r"(A), "a"(B) : "cc"); \ 3555 } while(0) 3556 3557 // As above, but add twice the double-length result into the 3558 // accumulator. 3559 #define MACC2(A, B, T0, T1, T2) \ 3560 do { \ 3561 unsigned long hi, lo; \ 3562 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3563 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3564 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3565 : "r"(A), "a"(B) : "cc"); \ 3566 } while(0) 3567 3568 #else //_WINDOWS 3569 3570 static julong 3571 sub(julong a[], julong b[], julong carry, long len) { 3572 long i; 3573 julong tmp; 3574 unsigned char c = 1; 3575 for (i = 0; i < len; i++) { 3576 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3577 a[i] = tmp; 3578 } 3579 c = _addcarry_u64(c, carry, ~0, &tmp); 3580 return tmp; 3581 } 3582 3583 // Multiply (unsigned) Long A by Long B, accumulating the double- 3584 // length result into the accumulator formed of T0, T1, and T2. 3585 #define MACC(A, B, T0, T1, T2) \ 3586 do { \ 3587 julong hi, lo; \ 3588 lo = _umul128(A, B, &hi); \ 3589 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3590 c = _addcarry_u64(c, hi, T1, &T1); \ 3591 _addcarry_u64(c, T2, 0, &T2); \ 3592 } while(0) 3593 3594 // As above, but add twice the double-length result into the 3595 // accumulator. 
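// In other words, treating (T2:T1:T0) as a single 192-bit little-endian
// accumulator (T0 least significant), both the inline-asm variants earlier
// in this file and the intrinsic variants here compute
//
//   MACC(A, B, T0, T1, T2)  :  (T2:T1:T0) += (julong)A * (julong)B
//   MACC2(A, B, T0, T1, T2) :  (T2:T1:T0) += 2 * (julong)A * (julong)B
//
// with the carries propagating from T0 up through T2.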
3596 #define MACC2(A, B, T0, T1, T2) \ 3597 do { \ 3598 julong hi, lo; \ 3599 lo = _umul128(A, B, &hi); \ 3600 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3601 c = _addcarry_u64(c, hi, T1, &T1); \ 3602 _addcarry_u64(c, T2, 0, &T2); \ 3603 c = _addcarry_u64(0, lo, T0, &T0); \ 3604 c = _addcarry_u64(c, hi, T1, &T1); \ 3605 _addcarry_u64(c, T2, 0, &T2); \ 3606 } while(0) 3607 3608 #endif //_WINDOWS 3609 3610 // Fast Montgomery multiplication. The derivation of the algorithm is 3611 // in A Cryptographic Library for the Motorola DSP56000, 3612 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3613 3614 static void NOINLINE 3615 montgomery_multiply(julong a[], julong b[], julong n[], 3616 julong m[], julong inv, int len) { 3617 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3618 int i; 3619 3620 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3621 3622 for (i = 0; i < len; i++) { 3623 int j; 3624 for (j = 0; j < i; j++) { 3625 MACC(a[j], b[i-j], t0, t1, t2); 3626 MACC(m[j], n[i-j], t0, t1, t2); 3627 } 3628 MACC(a[i], b[0], t0, t1, t2); 3629 m[i] = t0 * inv; 3630 MACC(m[i], n[0], t0, t1, t2); 3631 3632 assert(t0 == 0, "broken Montgomery multiply"); 3633 3634 t0 = t1; t1 = t2; t2 = 0; 3635 } 3636 3637 for (i = len; i < 2*len; i++) { 3638 int j; 3639 for (j = i-len+1; j < len; j++) { 3640 MACC(a[j], b[i-j], t0, t1, t2); 3641 MACC(m[j], n[i-j], t0, t1, t2); 3642 } 3643 m[i-len] = t0; 3644 t0 = t1; t1 = t2; t2 = 0; 3645 } 3646 3647 while (t0) 3648 t0 = sub(m, n, t0, len); 3649 } 3650 3651 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3652 // multiplies so it should be up to 25% faster than Montgomery 3653 // multiplication. However, its loop control is more complex and it 3654 // may actually run slower on some machines. 3655 3656 static void NOINLINE 3657 montgomery_square(julong a[], julong n[], 3658 julong m[], julong inv, int len) { 3659 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3660 int i; 3661 3662 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3663 3664 for (i = 0; i < len; i++) { 3665 int j; 3666 int end = (i+1)/2; 3667 for (j = 0; j < end; j++) { 3668 MACC2(a[j], a[i-j], t0, t1, t2); 3669 MACC(m[j], n[i-j], t0, t1, t2); 3670 } 3671 if ((i & 1) == 0) { 3672 MACC(a[j], a[j], t0, t1, t2); 3673 } 3674 for (; j < i; j++) { 3675 MACC(m[j], n[i-j], t0, t1, t2); 3676 } 3677 m[i] = t0 * inv; 3678 MACC(m[i], n[0], t0, t1, t2); 3679 3680 assert(t0 == 0, "broken Montgomery square"); 3681 3682 t0 = t1; t1 = t2; t2 = 0; 3683 } 3684 3685 for (i = len; i < 2*len; i++) { 3686 int start = i-len+1; 3687 int end = start + (len - start)/2; 3688 int j; 3689 for (j = start; j < end; j++) { 3690 MACC2(a[j], a[i-j], t0, t1, t2); 3691 MACC(m[j], n[i-j], t0, t1, t2); 3692 } 3693 if ((i & 1) == 0) { 3694 MACC(a[j], a[j], t0, t1, t2); 3695 } 3696 for (; j < len; j++) { 3697 MACC(m[j], n[i-j], t0, t1, t2); 3698 } 3699 m[i-len] = t0; 3700 t0 = t1; t1 = t2; t2 = 0; 3701 } 3702 3703 while (t0) 3704 t0 = sub(m, n, t0, len); 3705 } 3706 3707 // Swap words in a longword. 3708 static julong swap(julong x) { 3709 return (x << 32) | (x >> 32); 3710 } 3711 3712 // Copy len longwords from s to d, word-swapping as we go. The 3713 // destination array is reversed. 
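// A small illustrative example (the values are made up for this sketch and
// are not part of the original code): with len == 2,
//
//   julong s[2] = { 0x0000000200000001ULL, 0x0000000400000003ULL };
//   julong d[2];
//   reverse_words(s, d, 2);
//   // d[0] == 0x0000000300000004ULL, d[1] == 0x0000000100000002ULL
//
// i.e. the longword order is reversed and the two 32-bit halves of each
// longword are swapped, which converts between the jint array layout the
// callers below pass in and the julong layout the Montgomery routines use.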
3714 static void reverse_words(julong *s, julong *d, int len) {
3715 d += len;
3716 while(len-- > 0) {
3717 d--;
3718 *d = swap(*s);
3719 s++;
3720 }
3721 }
3722
3723 // The threshold at which squaring is advantageous was determined
3724 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3725 #define MONTGOMERY_SQUARING_THRESHOLD 64
3726
3727 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3728 jint len, jlong inv,
3729 jint *m_ints) {
3730 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3731 int longwords = len/2;
3732
3733 // Make very sure we don't use so much space that the stack might
3734 // overflow. 512 jints correspond to a 16384-bit integer and
3735 // will use here a total of 8k bytes of stack space.
3736 int total_allocation = longwords * sizeof (julong) * 4;
3737 guarantee(total_allocation <= 8192, "must be");
3738 julong *scratch = (julong *)alloca(total_allocation);
3739
3740 // Local scratch arrays
3741 julong
3742 *a = scratch + 0 * longwords,
3743 *b = scratch + 1 * longwords,
3744 *n = scratch + 2 * longwords,
3745 *m = scratch + 3 * longwords;
3746
3747 reverse_words((julong *)a_ints, a, longwords);
3748 reverse_words((julong *)b_ints, b, longwords);
3749 reverse_words((julong *)n_ints, n, longwords);
3750
3751 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3752
3753 reverse_words(m, (julong *)m_ints, longwords);
3754 }
3755
3756 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3757 jint len, jlong inv,
3758 jint *m_ints) {
3759 assert(len % 2 == 0, "array length in montgomery_square must be even");
3760 int longwords = len/2;
3761
3762 // Make very sure we don't use so much space that the stack might
3763 // overflow. 512 jints correspond to a 16384-bit integer and
3764 // will use here a total of 6k bytes of stack space.
3765 int total_allocation = longwords * sizeof (julong) * 3;
3766 guarantee(total_allocation <= 8192, "must be");
3767 julong *scratch = (julong *)alloca(total_allocation);
3768
3769 // Local scratch arrays
3770 julong
3771 *a = scratch + 0 * longwords,
3772 *n = scratch + 1 * longwords,
3773 *m = scratch + 2 * longwords;
3774
3775 reverse_words((julong *)a_ints, a, longwords);
3776 reverse_words((julong *)n_ints, n, longwords);
3777
3778 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3779 ::montgomery_square(a, n, m, (julong)inv, longwords);
3780 } else {
3781 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3782 }
3783
3784 reverse_words(m, (julong *)m_ints, longwords);
3785 }
3786
3787 #ifdef COMPILER2
3788 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3789 //
3790 //------------------------------generate_exception_blob---------------------------
3791 // Creates the exception blob at the end.
3792 // Compiled methods jump to this code via their exception handler
3793 // (see emit_exception_handler in the x86_64.ad file).
3794 //
3795 // Given an exception pc at a call, we call into the runtime for the
3796 // handler in this method. This handler might merely restore state
3797 // (i.e. callee save registers), unwind the frame, and jump to the
3798 // exception handler for the nmethod if there is no Java level handler
3799 // for the nmethod.
3800 //
3801 // This code is entered with a jmp.
3802 //
3803 // Arguments:
3804 // rax: exception oop
3805 // rdx: exception pc
3806 //
3807 // Results:
3808 // rax: exception oop
3809 // rdx: exception pc in caller or ???
3810 // destination: exception handler of caller
3811 //
3812 // Note: the exception pc MUST be at a call (precise debug information)
3813 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3814 //
3815
3816 void OptoRuntime::generate_exception_blob() {
3817 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3818 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3819 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3820
3821 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3822
3823 // Allocate space for the code
3824 ResourceMark rm;
3825 // Setup code generation tools
3826 CodeBuffer buffer("exception_blob", 2048, 1024);
3827 MacroAssembler* masm = new MacroAssembler(&buffer);
3828
3829
3830 address start = __ pc();
3831
3832 // Exception pc is 'return address' for stack walker
3833 __ push(rdx);
3834 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3835
3836 // Save callee-saved registers. See x86_64.ad.
3837
3838 // rbp is an implicitly saved callee saved register (i.e., the calling
3839 // convention will save/restore it in the prolog/epilog). Other than that
3840 // there are no callee save registers now that adapter frames are gone.
3841
3842 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3843
3844 // Store exception in Thread object. We cannot pass any arguments to the
3845 // handle_exception call, since we do not want to make any assumption
3846 // about the size of the frame in which the exception happened.
3847 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3848 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3849 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3850
3851 // This call does all the hard work. It checks if an exception handler
3852 // exists in the method.
3853 // If so, it returns the handler address.
3854 // If not, it prepares for stack-unwinding, restoring the callee-save
3855 // registers of the frame being removed.
3856 //
3857 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3858
3859 // At a method handle call, the stack may not be properly aligned
3860 // when returning with an exception.
3861 address the_pc = __ pc();
3862 __ set_last_Java_frame(noreg, noreg, the_pc);
3863 __ mov(c_rarg0, r15_thread);
3864 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3865 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3866
3867 // Set an oopmap for the call site. This oopmap will only be used if we
3868 // are unwinding the stack. Hence, all locations will be dead.
3869 // Callee-saved registers will be the same as the frame above (i.e.,
3870 // handle_exception_stub), since they were restored when we got the
3871 // exception.
3872
3873 OopMapSet* oop_maps = new OopMapSet();
3874
3875 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3876
3877 __ reset_last_Java_frame(false);
3878
3879 // Restore callee-saved registers
3880
3881 // rbp is an implicitly saved callee-saved register (i.e., the calling
3882 // convention will save/restore it in prolog/epilog). Other than that
3883 // there are no callee save registers now that adapter frames are gone.
3884 3885 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3886 3887 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3888 __ pop(rdx); // No need for exception pc anymore 3889 3890 // rax: exception handler 3891 3892 // We have a handler in rax (could be deopt blob). 3893 __ mov(r8, rax); 3894 3895 // Get the exception oop 3896 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3897 // Get the exception pc in case we are deoptimized 3898 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3899 #ifdef ASSERT 3900 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD); 3901 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD); 3902 #endif 3903 // Clear the exception oop so GC no longer processes it as a root. 3904 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD); 3905 3906 // rax: exception oop 3907 // r8: exception handler 3908 // rdx: exception pc 3909 // Jump to handler 3910 3911 __ jmp(r8); 3912 3913 // Make sure all code is generated 3914 masm->flush(); 3915 3916 // Set exception blob 3917 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3918 } 3919 #endif // COMPILER2 3920 3921 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt, 3922 int total_in_args, const VMRegPair* in_regs, 3923 int total_out_args, VMRegPair* out_regs, 3924 GrowableArray<int>& arg_order, 3925 VMRegPair tmp_vmreg) { 3926 ComputeMoveOrder order(total_in_args, in_regs, 3927 total_out_args, out_regs, 3928 in_sig_bt, arg_order, tmp_vmreg); 3929 }
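// A note on compute_move_order (the register names below are invented for
// illustration and do not refer to a specific caller): the argument shuffle
// from in_regs to out_regs may contain cycles, for example one argument
// moving rsi -> rdx while another moves rdx -> rsi. Emitting those moves in
// an arbitrary order would clobber a source before it is read, so
// ComputeMoveOrder chooses a safe ordering and relies on the caller-supplied
// tmp_vmreg scratch location to break such cycles, conceptually:
//
//   mov tmp, rdx   // park one element of the cycle
//   mov rdx, rsi
//   mov rsi, tmp
//
// The chosen ordering is returned through arg_order as consecutive pairs of
// (input argument index, output argument index).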