/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
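  // Worked example (illustrative, using the XSAVE area constants defined
  // below and BytesPerInt == 4): xmm_off ends up 160/4 = 40 slots above
  // fpu_state_off, and ymm_off a further (576 - 160)/4 = 104 slots above
  // xmm_off, i.e. the enum simply mirrors the hardware FXSAVE/XSAVE image
  // that push_CPU_state() lays down.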
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

// Register is a class, but it can carry a numerical value: "0" is assigned
// for rax. Thus we need to ignore -Wnonnull.
PRAGMA_DIAG_PUSH
PRAGMA_NONNULL_IGNORED
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_vectors && UseAVX == 0) {
    save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter().
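  // Once enter(), push_CPU_state() and the argument-area adjustment below
  // have run, the stack matches the 'layout' enum above: the FXSAVE/XSAVE
  // image sits lowest (just above the argument register save area), the
  // general-purpose registers and flags sit above it, and the saved rbp
  // plus the return address sit at the top of the frame.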
  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of ZMM registers (16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
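  // Each set_callee_saved() entry below records that the named register was
  // saved at the given compiler stack slot, e.g. rax at slot rax_off, which
  // is byte offset rax_off * BytesPerInt from rsp (the same offset that
  // rax_offset_in_bytes() above hands to deoptimization).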
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
PRAGMA_DIAG_POP

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
  int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of ZMM registers (16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
// values up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
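  // For example, for sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID }
  // the loop below assigns j_rarg0, j_rarg1, -, j_rarg2, j_farg0, -; no stack
  // slots are needed, so the function returns 0.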
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee.
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus one word,
  // because we also account for the return address location, since
  // we store it first rather than holding it in rax across all the shuffling.

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //  i   st_off
    //  0   32 T_LONG
    //  1   24 T_VOID
    //  2   16 T_OBJECT
    //  3    8 T_BOOL
    //  -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break it up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
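    // Concretely, for the T_LONG at i == 0 above: st_off == 32 and
    // next_off == 24; the 64-bit value is stored once at next_off (the slot
    // that corresponds to the trailing T_VOID), and in debug builds the
    // slot at st_off is only filled with junk.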
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory: use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float: use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
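    // For example, comp_args_on_stack == 3 (three 4-byte slots) becomes
    // align_up(12, wordSize) >> LogBytesPerWord == 2 words, which is already
    // 2-word aligned, so rsp is dropped by 16 bytes below.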
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees the stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address, misaligning the stack so that the youngest
  // frame sees it exactly as it would after a call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled", so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it, should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
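  // The unverified entry that follows performs the inline-cache check:
  // rax carries the CompiledICHolder from the call site; if the receiver's
  // klass does not match the holder's expected klass we branch to the
  // ic-miss stub instead of falling through to the verified c2i entry.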
  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
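  // For example, for out_sig_bt = { T_ADDRESS, T_OBJECT, T_INT, T_DOUBLE, T_VOID }
  // (a typical JNIEnv*, jobject, jint, jdouble signature) the loop below
  // assigns c_rarg0, c_rarg1, c_rarg2, c_farg0, - on Linux and returns 0;
  // on Win64 each register argument also reserves its 8-byte home in the
  // 32-byte shadow area, so the same signature returns 8 stack slots.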
// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Different signatures may require very different orders for the moves
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
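// For example, if the required moves are rdi->rsi and rsi->rdx, the store
// rsi->rdx must be issued before rdi->rsi so that rsi is read before it is
// clobbered. A pure cycle such as rdi->rsi, rsi->rdi has no safe order and
// is broken by staging one value through the caller-supplied temporary
// (tmp_vmreg): rsi->tmp, rdi->rsi, tmp->rdi.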
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair        _src;
    VMRegPair        _dst;
    int              _src_index;
    int              _dst_index;
    bool             _processed;
    MoveOperation*   _next;
    MoveOperation*   _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end.
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
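    // Such moves (destination on the stack, or a no-op because source and
    // destination match) are pushed onto arg_order immediately; only moves
    // into registers become edges that need ordering below.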
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
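    // For example, with edges A: rdi->rsi and B: rsi->rdi, link() makes A and
    // B point at each other; the walk below detects the cycle
    // (start->prev() == s), calls break_cycle(), and the emitted order
    // becomes rsi->temp, rdi->rsi, temp->rdi.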
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward, inserting into the store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
    has_receiver = true;
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method.  The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to java state (possibly blocking), unhandlizes any result and
// returns.
//
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
//
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       (OopMapSet*)NULL);
  }
  address native_func = method->native_function();
  assert(native_func != NULL, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).

  const int total_in_args = method->size_of_parameters();
  int  total_c_args       = total_in_args + (method->is_static() ? 2 : 1);
2 : 1); 1544 1545 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1546 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1547 BasicType* in_elem_bt = NULL; 1548 1549 int argc = 0; 1550 out_sig_bt[argc++] = T_ADDRESS; 1551 if (method->is_static()) { 1552 out_sig_bt[argc++] = T_OBJECT; 1553 } 1554 1555 for (int i = 0; i < total_in_args ; i++ ) { 1556 out_sig_bt[argc++] = in_sig_bt[i]; 1557 } 1558 1559 // Now figure out where the args must be stored and how much stack space 1560 // they require. 1561 int out_arg_slots; 1562 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1563 1564 // Compute framesize for the wrapper. We need to handlize all oops in 1565 // incoming registers 1566 1567 // Calculate the total number of stack slots we will need. 1568 1569 // First count the abi requirement plus all of the outgoing args 1570 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1571 1572 // Now the space for the inbound oop handle area 1573 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1574 1575 int oop_handle_offset = stack_slots; 1576 stack_slots += total_save_slots; 1577 1578 // Now any space we need for handlizing a klass if static method 1579 1580 int klass_slot_offset = 0; 1581 int klass_offset = -1; 1582 int lock_slot_offset = 0; 1583 bool is_static = false; 1584 1585 if (method->is_static()) { 1586 klass_slot_offset = stack_slots; 1587 stack_slots += VMRegImpl::slots_per_word; 1588 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1589 is_static = true; 1590 } 1591 1592 // Plus a lock if needed 1593 1594 if (method->is_synchronized()) { 1595 lock_slot_offset = stack_slots; 1596 stack_slots += VMRegImpl::slots_per_word; 1597 } 1598 1599 // Now a place (+2) to save return values or temp during shuffling 1600 // + 4 for return address (which we own) and saved rbp 1601 stack_slots += 6; 1602 1603 // Ok The space we have allocated will look like: 1604 // 1605 // 1606 // FP-> | | 1607 // |---------------------| 1608 // | 2 slots for moves | 1609 // |---------------------| 1610 // | lock box (if sync) | 1611 // |---------------------| <- lock_slot_offset 1612 // | klass (if static) | 1613 // |---------------------| <- klass_slot_offset 1614 // | oopHandle area | 1615 // |---------------------| <- oop_handle_offset (6 java arg registers) 1616 // | outbound memory | 1617 // | based arguments | 1618 // | | 1619 // |---------------------| 1620 // | | 1621 // SP-> | out_preserved_slots | 1622 // 1623 // 1624 1625 1626 // Now compute actual number of stack words we need rounding to make 1627 // stack properly aligned. 1628 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1629 1630 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1631 1632 // First thing make an ic check to see if we should even be here 1633 1634 // We are free to use all registers as temps without saving them and 1635 // restoring them except rbp. rbp is the only callee save register 1636 // as far as the interpreter and the compiler(s) are concerned. 
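  // Illustrative example of the frame-size computation above (numbers are a sketch and
  // assume a System V build where c_calling_convention() needs no outgoing stack
  // arguments, i.e. out_arg_slots == 0, and out_preserve_stack_slots() == 0):
  //   static synchronized native:  0 (outgoing args) + 12 (oop handle area, 6 regs * 2 slots)
  //                              + 2 (klass slot) + 2 (lock slot) + 6 (moves/ra/rbp)
  //                              = 22 slots, aligned up to 24 slots = 96 bytes of frame.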
1637 1638 1639 const Register ic_reg = rax; 1640 const Register receiver = j_rarg0; 1641 1642 Label hit; 1643 Label exception_pending; 1644 1645 assert_different_registers(ic_reg, receiver, rscratch1); 1646 __ verify_oop(receiver); 1647 __ load_klass(rscratch1, receiver, rscratch2); 1648 __ cmpq(ic_reg, rscratch1); 1649 __ jcc(Assembler::equal, hit); 1650 1651 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1652 1653 // Verified entry point must be aligned 1654 __ align(8); 1655 1656 __ bind(hit); 1657 1658 int vep_offset = ((intptr_t)__ pc()) - start; 1659 1660 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1661 Label L_skip_barrier; 1662 Register klass = r10; 1663 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1664 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1665 1666 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1667 1668 __ bind(L_skip_barrier); 1669 } 1670 1671 #ifdef COMPILER1 1672 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1673 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1674 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1675 } 1676 #endif // COMPILER1 1677 1678 // The instruction at the verified entry point must be 5 bytes or longer 1679 // because it can be patched on the fly by make_non_entrant. The stack bang 1680 // instruction fits that requirement. 1681 1682 // Generate stack overflow check 1683 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1684 1685 // Generate a new frame for the wrapper. 1686 __ enter(); 1687 // -2 because return address is already present and so is saved rbp 1688 __ subptr(rsp, stack_size - 2*wordSize); 1689 1690 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1691 bs->nmethod_entry_barrier(masm); 1692 1693 // Frame is now completed as far as size and linkage. 1694 int frame_complete = ((intptr_t)__ pc()) - start; 1695 1696 if (UseRTMLocking) { 1697 // Abort RTM transaction before calling JNI 1698 // because critical section will be large and will be 1699 // aborted anyway. Also nmethod could be deoptimized. 1700 __ xabort(0); 1701 } 1702 1703 #ifdef ASSERT 1704 { 1705 Label L; 1706 __ mov(rax, rsp); 1707 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI) 1708 __ cmpptr(rax, rsp); 1709 __ jcc(Assembler::equal, L); 1710 __ stop("improperly aligned stack"); 1711 __ bind(L); 1712 } 1713 #endif /* ASSERT */ 1714 1715 1716 // We use r14 as the oop handle for the receiver/klass 1717 // It is callee save so it survives the call to native 1718 1719 const Register oop_handle_reg = r14; 1720 1721 // 1722 // We immediately shuffle the arguments so that any vm call we have to 1723 // make from here on out (sync slow path, jvmti, etc.) we will have 1724 // captured the oops from our caller and have a valid oopMap for 1725 // them. 1726 1727 // ----------------- 1728 // The Grand Shuffle 1729 1730 // The Java calling convention is either equal (linux) or denser (win64) than the 1731 // c calling convention. However the because of the jni_env argument the c calling 1732 // convention always has at least one more (and two for static) arguments than Java. 
1733 // Therefore if we move the args from java -> c backwards then we will never have 1734 // a register->register conflict and we don't have to build a dependency graph 1735 // and figure out how to break any cycles. 1736 // 1737 1738 // Record esp-based slot for receiver on stack for non-static methods 1739 int receiver_offset = -1; 1740 1741 // This is a trick. We double the stack slots so we can claim 1742 // the oops in the caller's frame. Since we are sure to have 1743 // more args than the caller doubling is enough to make 1744 // sure we can capture all the incoming oop args from the 1745 // caller. 1746 // 1747 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1748 1749 // Mark location of rbp (someday) 1750 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1751 1752 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1753 // All inbound args are referenced based on rbp and all outbound args via rsp. 1754 1755 1756 #ifdef ASSERT 1757 bool reg_destroyed[RegisterImpl::number_of_registers]; 1758 bool freg_destroyed[XMMRegisterImpl::number_of_registers]; 1759 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { 1760 reg_destroyed[r] = false; 1761 } 1762 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) { 1763 freg_destroyed[f] = false; 1764 } 1765 1766 #endif /* ASSERT */ 1767 1768 // For JNI natives the incoming and outgoing registers are offset upwards. 1769 GrowableArray<int> arg_order(2 * total_in_args); 1770 1771 VMRegPair tmp_vmreg; 1772 tmp_vmreg.set2(rbx->as_VMReg()); 1773 1774 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 1775 arg_order.push(i); 1776 arg_order.push(c_arg); 1777 } 1778 1779 int temploc = -1; 1780 for (int ai = 0; ai < arg_order.length(); ai += 2) { 1781 int i = arg_order.at(ai); 1782 int c_arg = arg_order.at(ai + 1); 1783 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 1784 #ifdef ASSERT 1785 if (in_regs[i].first()->is_Register()) { 1786 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 1787 } else if (in_regs[i].first()->is_XMMRegister()) { 1788 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 1789 } 1790 if (out_regs[c_arg].first()->is_Register()) { 1791 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 1792 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 1793 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 1794 } 1795 #endif /* ASSERT */ 1796 switch (in_sig_bt[i]) { 1797 case T_ARRAY: 1798 case T_OBJECT: 1799 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 1800 ((i == 0) && (!is_static)), 1801 &receiver_offset); 1802 break; 1803 case T_VOID: 1804 break; 1805 1806 case T_FLOAT: 1807 __ float_move(in_regs[i], out_regs[c_arg]); 1808 break; 1809 1810 case T_DOUBLE: 1811 assert( i + 1 < total_in_args && 1812 in_sig_bt[i + 1] == T_VOID && 1813 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 1814 __ double_move(in_regs[i], out_regs[c_arg]); 1815 break; 1816 1817 case T_LONG : 1818 __ long_move(in_regs[i], out_regs[c_arg]); 1819 break; 1820 1821 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 1822 1823 default: 1824 __ move32_64(in_regs[i], out_regs[c_arg]); 1825 } 1826 } 1827 1828 int c_arg; 1829 1830 // Pre-load a static method's oop into r14. Used both by locking code and 1831 // the normal JNI call code. 
1832 // point c_arg at the first arg that is already loaded in case we 1833 // need to spill before we call out 1834 c_arg = total_c_args - total_in_args; 1835 1836 if (method->is_static()) { 1837 1838 // load oop into a register 1839 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 1840 1841 // Now handlize the static class mirror it's known not-null. 1842 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 1843 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 1844 1845 // Now get the handle 1846 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 1847 // store the klass handle as second argument 1848 __ movptr(c_rarg1, oop_handle_reg); 1849 // and protect the arg if we must spill 1850 c_arg--; 1851 } 1852 1853 // Change state to native (we save the return address in the thread, since it might not 1854 // be pushed on the stack when we do a a stack traversal). It is enough that the pc() 1855 // points into the right code segment. It does not have to be the correct return pc. 1856 // We use the same pc/oopMap repeatedly when we call out 1857 1858 intptr_t the_pc = (intptr_t) __ pc(); 1859 oop_maps->add_gc_map(the_pc - start, map); 1860 1861 __ set_last_Java_frame(rsp, noreg, (address)the_pc); 1862 1863 1864 // We have all of the arguments setup at this point. We must not touch any register 1865 // argument registers at this point (what if we save/restore them there are no oop? 1866 1867 { 1868 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 1869 // protect the args we've loaded 1870 save_args(masm, total_c_args, c_arg, out_regs); 1871 __ mov_metadata(c_rarg1, method()); 1872 __ call_VM_leaf( 1873 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 1874 r15_thread, c_rarg1); 1875 restore_args(masm, total_c_args, c_arg, out_regs); 1876 } 1877 1878 // RedefineClasses() tracing support for obsolete method entry 1879 if (log_is_enabled(Trace, redefine, class, obsolete)) { 1880 // protect the args we've loaded 1881 save_args(masm, total_c_args, c_arg, out_regs); 1882 __ mov_metadata(c_rarg1, method()); 1883 __ call_VM_leaf( 1884 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 1885 r15_thread, c_rarg1); 1886 restore_args(masm, total_c_args, c_arg, out_regs); 1887 } 1888 1889 // Lock a synchronized method 1890 1891 // Register definitions used by locking and unlocking 1892 1893 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 1894 const Register obj_reg = rbx; // Will contain the oop 1895 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 1896 const Register old_hdr = r13; // value of old header at unlock time 1897 1898 Label slow_path_lock; 1899 Label lock_done; 1900 1901 if (method->is_synchronized()) { 1902 1903 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 1904 1905 // Get the handle (the 2nd argument) 1906 __ mov(oop_handle_reg, c_rarg1); 1907 1908 // Get address of the box 1909 1910 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 1911 1912 // Load the oop from the handle 1913 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 1914 1915 if (!UseHeavyMonitors) { 1916 // Load immediate 1 into swap_reg %rax 1917 __ movl(swap_reg, 1); 1918 1919 // Load (object->mark() | 1) into swap_reg %rax 1920 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 1921 1922 // Save (object->mark() | 1) into BasicLock's displaced header 1923 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 1924 1925 
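      // In outline, the rest of the fast path (a sketch of the code that follows, not
      // emitted literally):
      //   if (CAS(&obj->mark, displaced /* in rax */, box_address)) goto lock_done;  // we own the lock
      //   // CAS failed: rax now holds the current mark word
      //   box->displaced_header = (rax - rsp) & (3 - page_size);  // == 0 iff mark points into our stack
      //   if (box->displaced_header != 0) goto slow_path_lock;    // otherwise recursive lock,
      //                                                           // fall through to lock_done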
// src -> dest iff dest == rax else rax <- dest 1926 __ lock(); 1927 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 1928 __ jcc(Assembler::equal, lock_done); 1929 1930 // Hmm should this move to the slow path code area??? 1931 1932 // Test if the oopMark is an obvious stack pointer, i.e., 1933 // 1) (mark & 3) == 0, and 1934 // 2) rsp <= mark < mark + os::pagesize() 1935 // These 3 tests can be done by evaluating the following 1936 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 1937 // assuming both stack pointer and pagesize have their 1938 // least significant 2 bits clear. 1939 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 1940 1941 __ subptr(swap_reg, rsp); 1942 __ andptr(swap_reg, 3 - os::vm_page_size()); 1943 1944 // Save the test result, for recursive case, the result is zero 1945 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 1946 __ jcc(Assembler::notEqual, slow_path_lock); 1947 } else { 1948 __ jmp(slow_path_lock); 1949 } 1950 1951 // Slow path will re-enter here 1952 1953 __ bind(lock_done); 1954 } 1955 1956 // Finally just about ready to make the JNI call 1957 1958 // get JNIEnv* which is first argument to native 1959 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 1960 1961 // Now set thread in native 1962 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 1963 1964 __ call(RuntimeAddress(native_func)); 1965 1966 // Verify or restore cpu control state after JNI call 1967 __ restore_cpu_control_state_after_jni(); 1968 1969 // Unpack native results. 1970 switch (ret_type) { 1971 case T_BOOLEAN: __ c2bool(rax); break; 1972 case T_CHAR : __ movzwl(rax, rax); break; 1973 case T_BYTE : __ sign_extend_byte (rax); break; 1974 case T_SHORT : __ sign_extend_short(rax); break; 1975 case T_INT : /* nothing to do */ break; 1976 case T_DOUBLE : 1977 case T_FLOAT : 1978 // Result is in xmm0 we'll save as needed 1979 break; 1980 case T_ARRAY: // Really a handle 1981 case T_OBJECT: // Really a handle 1982 break; // can't de-handlize until after safepoint check 1983 case T_VOID: break; 1984 case T_LONG: break; 1985 default : ShouldNotReachHere(); 1986 } 1987 1988 Label after_transition; 1989 1990 // Switch thread to "native transition" state before reading the synchronization state. 1991 // This additional state is necessary because reading and testing the synchronization 1992 // state is not atomic w.r.t. GC, as this scenario demonstrates: 1993 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 1994 // VM thread changes sync state to synchronizing and suspends threads for GC. 1995 // Thread A is resumed to finish this native method, but doesn't block here since it 1996 // didn't see any synchronization is progress, and escapes. 
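  // The code below therefore performs the transition in this order:
  //   1. thread_state = _thread_in_native_trans
  //   2. full fence, so the state change is visible before the poll below
  //   3. poll the safepoint word and the suspend flags
  //   4. if either is set, call JavaThread::check_special_condition_for_native_trans
  //      by hand (call_VM / call_VM_leaf cannot be used here, see the comment below)
  //   5. thread_state = _thread_in_Java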
1997 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 1998 1999 // Force this write out before the read below 2000 __ membar(Assembler::Membar_mask_bits( 2001 Assembler::LoadLoad | Assembler::LoadStore | 2002 Assembler::StoreLoad | Assembler::StoreStore)); 2003 2004 // check for safepoint operation in progress and/or pending suspend requests 2005 { 2006 Label Continue; 2007 Label slow_path; 2008 2009 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2010 2011 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2012 __ jcc(Assembler::equal, Continue); 2013 __ bind(slow_path); 2014 2015 // Don't use call_VM as it will see a possible pending exception and forward it 2016 // and never return here preventing us from clearing _last_native_pc down below. 2017 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2018 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2019 // by hand. 2020 // 2021 __ vzeroupper(); 2022 save_native_result(masm, ret_type, stack_slots); 2023 __ mov(c_rarg0, r15_thread); 2024 __ mov(r12, rsp); // remember sp 2025 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2026 __ andptr(rsp, -16); // align stack as required by ABI 2027 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2028 __ mov(rsp, r12); // restore sp 2029 __ reinit_heapbase(); 2030 // Restore any method result value 2031 restore_native_result(masm, ret_type, stack_slots); 2032 __ bind(Continue); 2033 } 2034 2035 // change thread state 2036 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2037 __ bind(after_transition); 2038 2039 Label reguard; 2040 Label reguard_done; 2041 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2042 __ jcc(Assembler::equal, reguard); 2043 __ bind(reguard_done); 2044 2045 // native result if any is live 2046 2047 // Unlock 2048 Label unlock_done; 2049 Label slow_path_unlock; 2050 if (method->is_synchronized()) { 2051 2052 // Get locked oop from the handle we passed to jni 2053 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2054 2055 Label done; 2056 2057 if (!UseHeavyMonitors) { 2058 // Simple recursive lock? 
2059 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD); 2060 __ jcc(Assembler::equal, done); 2061 } 2062 2063 // Must save rax if it is live now because cmpxchg must use it 2064 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2065 save_native_result(masm, ret_type, stack_slots); 2066 } 2067 2068 2069 if (!UseHeavyMonitors) { 2070 // get address of the stack lock 2071 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2072 // get old displaced header 2073 __ movptr(old_hdr, Address(rax, 0)); 2074 2075 // Atomic swap old header if oop still contains the stack lock 2076 __ lock(); 2077 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2078 __ jcc(Assembler::notEqual, slow_path_unlock); 2079 } else { 2080 __ jmp(slow_path_unlock); 2081 } 2082 2083 // slow path re-enters here 2084 __ bind(unlock_done); 2085 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2086 restore_native_result(masm, ret_type, stack_slots); 2087 } 2088 2089 __ bind(done); 2090 2091 } 2092 { 2093 SkipIfEqual skip(masm, &DTraceMethodProbes, false); 2094 save_native_result(masm, ret_type, stack_slots); 2095 __ mov_metadata(c_rarg1, method()); 2096 __ call_VM_leaf( 2097 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2098 r15_thread, c_rarg1); 2099 restore_native_result(masm, ret_type, stack_slots); 2100 } 2101 2102 __ reset_last_Java_frame(false); 2103 2104 // Unbox oop result, e.g. JNIHandles::resolve value. 2105 if (is_reference_type(ret_type)) { 2106 __ resolve_jobject(rax /* value */, 2107 r15_thread /* thread */, 2108 rcx /* tmp */); 2109 } 2110 2111 if (CheckJNICalls) { 2112 // clear_pending_jni_exception_check 2113 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2114 } 2115 2116 // reset handle block 2117 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2118 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD); 2119 2120 // pop our frame 2121 2122 __ leave(); 2123 2124 // Any exception pending? 2125 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2126 __ jcc(Assembler::notEqual, exception_pending); 2127 2128 // Return 2129 2130 __ ret(0); 2131 2132 // Unexpected paths are out of line and go here 2133 2134 // forward the exception 2135 __ bind(exception_pending); 2136 2137 // and forward the exception 2138 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2139 2140 // Slow path locking & unlocking 2141 if (method->is_synchronized()) { 2142 2143 // BEGIN Slow path lock 2144 __ bind(slow_path_lock); 2145 2146 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2147 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2148 2149 // protect the args we've loaded 2150 save_args(masm, total_c_args, c_arg, out_regs); 2151 2152 __ mov(c_rarg0, obj_reg); 2153 __ mov(c_rarg1, lock_reg); 2154 __ mov(c_rarg2, r15_thread); 2155 2156 // Not a leaf but we have last_Java_frame setup as we want 2157 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2158 restore_args(masm, total_c_args, c_arg, out_regs); 2159 2160 #ifdef ASSERT 2161 { Label L; 2162 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2163 __ jcc(Assembler::equal, L); 2164 __ stop("no pending exception allowed on exit from monitorenter"); 2165 __ bind(L); 2166 } 2167 #endif 2168 __ jmp(lock_done); 2169 2170 // END Slow path lock 2171 2172 // BEGIN Slow path unlock 2173 __ bind(slow_path_unlock); 2174 2175 // If we haven't already saved the native result we must save it now as xmm registers 2176 // are still exposed. 2177 __ vzeroupper(); 2178 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2179 save_native_result(masm, ret_type, stack_slots); 2180 } 2181 2182 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2183 2184 __ mov(c_rarg0, obj_reg); 2185 __ mov(c_rarg2, r15_thread); 2186 __ mov(r12, rsp); // remember sp 2187 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2188 __ andptr(rsp, -16); // align stack as required by ABI 2189 2190 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2191 // NOTE that obj_reg == rbx currently 2192 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2193 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD); 2194 2195 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2196 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2197 __ mov(rsp, r12); // restore sp 2198 __ reinit_heapbase(); 2199 #ifdef ASSERT 2200 { 2201 Label L; 2202 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD); 2203 __ jcc(Assembler::equal, L); 2204 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2205 __ bind(L); 2206 } 2207 #endif /* ASSERT */ 2208 2209 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2210 2211 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2212 restore_native_result(masm, ret_type, stack_slots); 2213 } 2214 __ jmp(unlock_done); 2215 2216 // END Slow path unlock 2217 2218 } // synchronized 2219 2220 // SLOW PATH Reguard the stack if needed 2221 2222 __ bind(reguard); 2223 __ vzeroupper(); 2224 save_native_result(masm, ret_type, stack_slots); 2225 __ mov(r12, rsp); // remember sp 2226 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2227 __ andptr(rsp, -16); // align stack as required by ABI 2228 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2229 __ mov(rsp, r12); // restore sp 2230 __ reinit_heapbase(); 2231 restore_native_result(masm, ret_type, stack_slots); 2232 // and continue 2233 __ jmp(reguard_done); 2234 2235 2236 2237 __ flush(); 2238 2239 nmethod *nm = nmethod::new_native_nmethod(method, 2240 compile_id, 2241 masm->code(), 2242 vep_offset, 2243 frame_complete, 2244 stack_slots / VMRegImpl::slots_per_word, 2245 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2246 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2247 oop_maps); 2248 2249 return nm; 2250 } 2251 2252 // this function returns the adjust size (in number of words) to a c2i adapter 2253 // activation for use during deoptimization 2254 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2255 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2256 } 2257 2258 2259 uint SharedRuntime::out_preserve_stack_slots() { 2260 return 0; 2261 } 2262 2263 2264 // Number of stack slots between incoming argument block and the start of 2265 // a new frame. The PROLOG must add this many slots to the stack. The 2266 // EPILOG must remove this many slots. amd64 needs two slots for 2267 // return address. 2268 uint SharedRuntime::in_preserve_stack_slots() { 2269 return 4 + 2 * VerifyStackAtCalls; 2270 } 2271 2272 //------------------------------generate_deopt_blob---------------------------- 2273 void SharedRuntime::generate_deopt_blob() { 2274 // Allocate space for the code 2275 ResourceMark rm; 2276 // Setup code generation tools 2277 int pad = 0; 2278 if (UseAVX > 2) { 2279 pad += 1024; 2280 } 2281 #if INCLUDE_JVMCI 2282 if (EnableJVMCI) { 2283 pad += 512; // Increase the buffer size when compiling for JVMCI 2284 } 2285 #endif 2286 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2287 MacroAssembler* masm = new MacroAssembler(&buffer); 2288 int frame_size_in_words; 2289 OopMap* map = NULL; 2290 OopMapSet *oop_maps = new OopMapSet(); 2291 2292 // ------------- 2293 // This code enters when returning to a de-optimized nmethod. A return 2294 // address has been pushed on the the stack, and return values are in 2295 // registers. 2296 // If we are doing a normal deopt then we were called from the patched 2297 // nmethod from the point we returned to the nmethod. So the return 2298 // address on the stack is wrong by NativeCall::instruction_size 2299 // We will adjust the value so it looks like we have the original return 2300 // address on the stack (like when we eagerly deoptimized). 2301 // In the case of an exception pending when deoptimizing, we enter 2302 // with a return address on the stack that points after the call we patched 2303 // into the exception handler. We have the following register state from, 2304 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2305 // rax: exception oop 2306 // rbx: exception handler 2307 // rdx: throwing pc 2308 // So in this case we simply jam rdx into the useless return address and 2309 // the stack looks just like we want. 2310 // 2311 // At this point we need to de-opt. We save the argument return 2312 // registers. We call the first C routine, fetch_unroll_info(). This 2313 // routine captures the return values and returns a structure which 2314 // describes the current frame size and the sizes of all replacement frames. 2315 // The current frame is compiled code and may contain many inlined 2316 // functions, each with their own JVM state. We pop the current frame, then 2317 // push all the new frames. Then we call the C routine unpack_frames() to 2318 // populate these frames. Finally unpack_frames() returns us the new target 2319 // address. Notice that callee-save registers are BLOWN here; they have 2320 // already been captured in the vframeArray at the time the return PC was 2321 // patched. 2322 address start = __ pc(); 2323 Label cont; 2324 2325 // Prolog for non exception case! 
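  // The blob has several entry points. Each records an exec mode for unpack_frames()
  // in r14 (callee-saved); most of them then join the common code at 'cont':
  //   start                    - normal deoptimization              (Unpack_deopt)
  //   reexecute_offset         - re-execute the current bytecode    (Unpack_reexecute)
  //   exception_offset         - exception oop/pc arrive in rax/rdx (Unpack_exception)
  //   exception_in_tls_offset  - exception oop/pc already stored in the JavaThread
  //   (JVMCI only) uncommon_trap_offset / implicit_exception_uncommon_trap_offset,
  //                which call Deoptimization::uncommon_trap directly and rejoin
  //                after the fetch_unroll_info call.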
2326 2327 // Save everything in sight. 2328 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true); 2329 2330 // Normal deoptimization. Save exec mode for unpack_frames. 2331 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2332 __ jmp(cont); 2333 2334 int reexecute_offset = __ pc() - start; 2335 #if INCLUDE_JVMCI && !defined(COMPILER1) 2336 if (EnableJVMCI && UseJVMCICompiler) { 2337 // JVMCI does not use this kind of deoptimization 2338 __ should_not_reach_here(); 2339 } 2340 #endif 2341 2342 // Reexecute case 2343 // return address is the pc describes what bci to do re-execute at 2344 2345 // No need to update map as each call to save_live_registers will produce identical oopmap 2346 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true); 2347 2348 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2349 __ jmp(cont); 2350 2351 #if INCLUDE_JVMCI 2352 Label after_fetch_unroll_info_call; 2353 int implicit_exception_uncommon_trap_offset = 0; 2354 int uncommon_trap_offset = 0; 2355 2356 if (EnableJVMCI) { 2357 implicit_exception_uncommon_trap_offset = __ pc() - start; 2358 2359 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2360 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD); 2361 2362 uncommon_trap_offset = __ pc() - start; 2363 2364 // Save everything in sight. 2365 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true); 2366 // fetch_unroll_info needs to call last_java_frame() 2367 __ set_last_Java_frame(noreg, noreg, NULL); 2368 2369 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2370 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2371 2372 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute); 2373 __ mov(c_rarg0, r15_thread); 2374 __ movl(c_rarg2, r14); // exec mode 2375 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2376 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2377 2378 __ reset_last_Java_frame(false); 2379 2380 __ jmp(after_fetch_unroll_info_call); 2381 } // EnableJVMCI 2382 #endif // INCLUDE_JVMCI 2383 2384 int exception_offset = __ pc() - start; 2385 2386 // Prolog for exception case 2387 2388 // all registers are dead at this entry point, except for rax, and 2389 // rdx which contain the exception oop and exception pc 2390 // respectively. Set them in TLS and fall thru to the 2391 // unpack_with_exception_in_tls entry point. 2392 2393 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2394 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2395 2396 int exception_in_tls_offset = __ pc() - start; 2397 2398 // new implementation because exception oop is now passed in JavaThread 2399 2400 // Prolog for exception case 2401 // All registers must be preserved because they might be used by LinearScan 2402 // Exceptiop oop and throwing PC are passed in JavaThread 2403 // tos: stack at point of call to method that threw the exception (i.e. only 2404 // args are on the stack, no return address) 2405 2406 // make room on stack for the return address 2407 // It will be patched later with the throwing pc. The correct value is not 2408 // available now because loading it from memory would destroy registers. 2409 __ push(0); 2410 2411 // Save everything in sight. 
2412 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true); 2413 2414 // Now it is safe to overwrite any register 2415 2416 // Deopt during an exception. Save exec mode for unpack_frames. 2417 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2418 2419 // load throwing pc from JavaThread and patch it as the return address 2420 // of the current frame. Then clear the field in JavaThread 2421 2422 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2423 __ movptr(Address(rbp, wordSize), rdx); 2424 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2425 2426 #ifdef ASSERT 2427 // verify that there is really an exception oop in JavaThread 2428 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2429 __ verify_oop(rax); 2430 2431 // verify that there is no pending exception 2432 Label no_pending_exception; 2433 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2434 __ testptr(rax, rax); 2435 __ jcc(Assembler::zero, no_pending_exception); 2436 __ stop("must not have pending exception here"); 2437 __ bind(no_pending_exception); 2438 #endif 2439 2440 __ bind(cont); 2441 2442 // Call C code. Need thread and this frame, but NOT official VM entry 2443 // crud. We cannot block on this call, no GC can happen. 2444 // 2445 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2446 2447 // fetch_unroll_info needs to call last_java_frame(). 2448 2449 __ set_last_Java_frame(noreg, noreg, NULL); 2450 #ifdef ASSERT 2451 { Label L; 2452 __ cmpptr(Address(r15_thread, 2453 JavaThread::last_Java_fp_offset()), 2454 (int32_t)0); 2455 __ jcc(Assembler::equal, L); 2456 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2457 __ bind(L); 2458 } 2459 #endif // ASSERT 2460 __ mov(c_rarg0, r15_thread); 2461 __ movl(c_rarg1, r14); // exec_mode 2462 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2463 2464 // Need to have an oopmap that tells fetch_unroll_info where to 2465 // find any register it might need. 2466 oop_maps->add_gc_map(__ pc() - start, map); 2467 2468 __ reset_last_Java_frame(false); 2469 2470 #if INCLUDE_JVMCI 2471 if (EnableJVMCI) { 2472 __ bind(after_fetch_unroll_info_call); 2473 } 2474 #endif 2475 2476 // Load UnrollBlock* into rdi 2477 __ mov(rdi, rax); 2478 2479 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2480 Label noException; 2481 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2482 __ jcc(Assembler::notEqual, noException); 2483 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2484 // QQQ this is useless it was NULL above 2485 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2486 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD); 2487 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD); 2488 2489 __ verify_oop(rax); 2490 2491 // Overwrite the result registers with the exception results. 2492 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2493 // I think this is useless 2494 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2495 2496 __ bind(noException); 2497 2498 // Only register save data is on the stack. 2499 // Now restore the result registers. Everything else is either dead 2500 // or captured in the vframeArray. 
2501 RegisterSaver::restore_result_registers(masm); 2502 2503 // All of the register save area has been popped off the stack. Only the 2504 // return address remains. 2505 2506 // Pop all the frames we must move/replace. 2507 // 2508 // Frame picture (youngest to oldest) 2509 // 1: self-frame (no frame link) 2510 // 2: deopting frame (no frame link) 2511 // 3: caller of deopting frame (could be compiled/interpreted). 2512 // 2513 // Note: by leaving the return address of self-frame on the stack 2514 // and using the size of frame 2 to adjust the stack 2515 // when we are done the return to frame 3 will still be on the stack. 2516 2517 // Pop deoptimized frame 2518 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes())); 2519 __ addptr(rsp, rcx); 2520 2521 // rsp should be pointing at the return address to the caller (3) 2522 2523 // Pick up the initial fp we should save 2524 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2525 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2526 2527 #ifdef ASSERT 2528 // Compilers generate code that bang the stack by as much as the 2529 // interpreter would need. So this stack banging should never 2530 // trigger a fault. Verify that it does not on non-product builds. 2531 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2532 __ bang_stack_size(rbx, rcx); 2533 #endif 2534 2535 // Load address of array of frame pcs into rcx 2536 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2537 2538 // Trash the old pc 2539 __ addptr(rsp, wordSize); 2540 2541 // Load address of array of frame sizes into rsi 2542 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes())); 2543 2544 // Load counter into rdx 2545 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); 2546 2547 // Now adjust the caller's stack to make up for the extra locals 2548 // but record the original sp so that we can save it in the skeletal interpreter 2549 // frame and the stack walking of interpreter_sender will get the unextended sp 2550 // value and not the "real" sp value.
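  // A sketch of the frame-pushing loop that follows the caller adjustment below
  // (illustrative pseudocode of the generated code, not compiled as-is):
  //   intptr_t* sizes = frame_sizes;  address* pcs = frame_pcs;  int count = number_of_frames;
  //   do {
  //     size_t size = *sizes++;
  //     push(*pcs++);                      // return address for the frame below this one
  //     push(rbp);  rbp = rsp;             // enter()
  //     rsp -= size - 2*wordSize;          // body of the skeletal interpreter frame
  //     fp[interpreter_frame_last_sp_offset]   = NULL;       // corrected by layout_activation_impl
  //     fp[interpreter_frame_sender_sp_offset] = sender_sp;  // make the frame walkable
  //     sender_sp = rsp;
  //   } while (--count != 0);
  //   push(*pcs);                          // final return address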
2551 2552 const Register sender_sp = r8; 2553 2554 __ mov(sender_sp, rsp); 2555 __ movl(rbx, Address(rdi, 2556 Deoptimization::UnrollBlock:: 2557 caller_adjustment_offset_in_bytes())); 2558 __ subptr(rsp, rbx); 2559 2560 // Push interpreter frames in a loop 2561 Label loop; 2562 __ bind(loop); 2563 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2564 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2565 __ pushptr(Address(rcx, 0)); // Save return address 2566 __ enter(); // Save old & set new ebp 2567 __ subptr(rsp, rbx); // Prolog 2568 // This value is corrected by layout_activation_impl 2569 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2570 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2571 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2572 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2573 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2574 __ decrementl(rdx); // Decrement counter 2575 __ jcc(Assembler::notZero, loop); 2576 __ pushptr(Address(rcx, 0)); // Save final return address 2577 2578 // Re-push self-frame 2579 __ enter(); // Save old & set new ebp 2580 2581 // Allocate a full sized register save area. 2582 // Return address and rbp are in place, so we allocate two less words. 2583 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2584 2585 // Restore frame locals after moving the frame 2586 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2587 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2588 2589 // Call C code. Need thread but NOT official VM entry 2590 // crud. We cannot block on this call, no GC can happen. Call should 2591 // restore return values to their stack-slots with the new SP. 2592 // 2593 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2594 2595 // Use rbp because the frames look interpreted now 2596 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2597 // Don't need the precise return PC here, just precise enough to point into this code blob. 2598 address the_pc = __ pc(); 2599 __ set_last_Java_frame(noreg, rbp, the_pc); 2600 2601 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2602 __ mov(c_rarg0, r15_thread); 2603 __ movl(c_rarg1, r14); // second arg: exec_mode 2604 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2605 // Revert SP alignment after call since we're going to do some SP relative addressing below 2606 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2607 2608 // Set an oopmap for the call site 2609 // Use the same PC we used for the last java frame 2610 oop_maps->add_gc_map(the_pc - start, 2611 new OopMap( frame_size_in_words, 0 )); 2612 2613 // Clear fp AND pc 2614 __ reset_last_Java_frame(true); 2615 2616 // Collect return values 2617 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2618 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2619 // I think this is useless (throwing pc?) 2620 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2621 2622 // Pop self-frame. 
2623 __ leave(); // Epilog 2624 2625 // Jump to interpreter 2626 __ ret(0); 2627 2628 // Make sure all code is generated 2629 masm->flush(); 2630 2631 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2632 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2633 #if INCLUDE_JVMCI 2634 if (EnableJVMCI) { 2635 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2636 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2637 } 2638 #endif 2639 } 2640 2641 #ifdef COMPILER2 2642 //------------------------------generate_uncommon_trap_blob-------------------- 2643 void SharedRuntime::generate_uncommon_trap_blob() { 2644 // Allocate space for the code 2645 ResourceMark rm; 2646 // Setup code generation tools 2647 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2648 MacroAssembler* masm = new MacroAssembler(&buffer); 2649 2650 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2651 2652 address start = __ pc(); 2653 2654 if (UseRTMLocking) { 2655 // Abort RTM transaction before possible nmethod deoptimization. 2656 __ xabort(0); 2657 } 2658 2659 // Push self-frame. We get here with a return address on the 2660 // stack, so rsp is 8-byte aligned until we allocate our frame. 2661 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2662 2663 // No callee saved registers. rbp is assumed implicitly saved 2664 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2665 2666 // compiler left unloaded_class_index in j_rarg0 move to where the 2667 // runtime expects it. 2668 __ movl(c_rarg1, j_rarg0); 2669 2670 __ set_last_Java_frame(noreg, noreg, NULL); 2671 2672 // Call C code. Need thread but NOT official VM entry 2673 // crud. We cannot block on this call, no GC can happen. Call should 2674 // capture callee-saved registers as well as return values. 2675 // Thread is in rdi already. 2676 // 2677 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2678 2679 __ mov(c_rarg0, r15_thread); 2680 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2681 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2682 2683 // Set an oopmap for the call site 2684 OopMapSet* oop_maps = new OopMapSet(); 2685 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2686 2687 // location of rbp is known implicitly by the frame sender code 2688 2689 oop_maps->add_gc_map(__ pc() - start, map); 2690 2691 __ reset_last_Java_frame(false); 2692 2693 // Load UnrollBlock* into rdi 2694 __ mov(rdi, rax); 2695 2696 #ifdef ASSERT 2697 { Label L; 2698 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2699 (int32_t)Deoptimization::Unpack_uncommon_trap); 2700 __ jcc(Assembler::equal, L); 2701 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap"); 2702 __ bind(L); 2703 } 2704 #endif 2705 2706 // Pop all the frames we must move/replace. 2707 // 2708 // Frame picture (youngest to oldest) 2709 // 1: self-frame (no frame link) 2710 // 2: deopting frame (no frame link) 2711 // 3: caller of deopting frame (could be compiled/interpreted). 2712 2713 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2714 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 
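  // From this point the uncommon-trap blob mirrors generate_deopt_blob(): pop the
  // deoptimized frame, rebuild one skeletal interpreter frame per replacement frame
  // described by the UnrollBlock, then call Deoptimization::unpack_frames() with
  // Unpack_uncommon_trap as the exec mode and return into the interpreter.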
2715 2716 // Pop deoptimized frame (int) 2717 __ movl(rcx, Address(rdi, 2718 Deoptimization::UnrollBlock:: 2719 size_of_deoptimized_frame_offset_in_bytes())); 2720 __ addptr(rsp, rcx); 2721 2722 // rsp should be pointing at the return address to the caller (3) 2723 2724 // Pick up the initial fp we should save 2725 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2726 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2727 2728 #ifdef ASSERT 2729 // Compilers generate code that bang the stack by as much as the 2730 // interpreter would need. So this stack banging should never 2731 // trigger a fault. Verify that it does not on non product builds. 2732 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2733 __ bang_stack_size(rbx, rcx); 2734 #endif 2735 2736 // Load address of array of frame pcs into rcx (address*) 2737 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2738 2739 // Trash the return pc 2740 __ addptr(rsp, wordSize); 2741 2742 // Load address of array of frame sizes into rsi (intptr_t*) 2743 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2744 2745 // Counter 2746 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2747 2748 // Now adjust the caller's stack to make up for the extra locals but 2749 // record the original sp so that we can save it in the skeletal 2750 // interpreter frame and the stack walking of interpreter_sender 2751 // will get the unextended sp value and not the "real" sp value. 2752 2753 const Register sender_sp = r8; 2754 2755 __ mov(sender_sp, rsp); 2756 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 2757 __ subptr(rsp, rbx); 2758 2759 // Push interpreter frames in a loop 2760 Label loop; 2761 __ bind(loop); 2762 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2763 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 2764 __ pushptr(Address(rcx, 0)); // Save return address 2765 __ enter(); // Save old & set new rbp 2766 __ subptr(rsp, rbx); // Prolog 2767 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 2768 sender_sp); // Make it walkable 2769 // This value is corrected by layout_activation_impl 2770 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD ); 2771 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2772 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2773 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2774 __ decrementl(rdx); // Decrement counter 2775 __ jcc(Assembler::notZero, loop); 2776 __ pushptr(Address(rcx, 0)); // Save final return address 2777 2778 // Re-push self-frame 2779 __ enter(); // Save old & set new rbp 2780 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 2781 // Prolog 2782 2783 // Use rbp because the frames look interpreted now 2784 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2785 // Don't need the precise return PC here, just precise enough to point into this code blob. 2786 address the_pc = __ pc(); 2787 __ set_last_Java_frame(noreg, rbp, the_pc); 2788 2789 // Call C code. Need thread but NOT official VM entry 2790 // crud. We cannot block on this call, no GC can happen. 
Call should 2791 // restore return values to their stack-slots with the new SP. 2792 // Thread is in rdi already. 2793 // 2794 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 2795 2796 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 2797 __ mov(c_rarg0, r15_thread); 2798 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 2799 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2800 2801 // Set an oopmap for the call site 2802 // Use the same PC we used for the last java frame 2803 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 2804 2805 // Clear fp AND pc 2806 __ reset_last_Java_frame(true); 2807 2808 // Pop self-frame. 2809 __ leave(); // Epilog 2810 2811 // Jump to interpreter 2812 __ ret(0); 2813 2814 // Make sure all code is generated 2815 masm->flush(); 2816 2817 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 2818 SimpleRuntimeFrame::framesize >> 1); 2819 } 2820 #endif // COMPILER2 2821 2822 //------------------------------generate_handler_blob------ 2823 // 2824 // Generate a special Compile2Runtime blob that saves all registers, 2825 // and setup oopmap. 2826 // 2827 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 2828 assert(StubRoutines::forward_exception_entry() != NULL, 2829 "must be generated before"); 2830 2831 ResourceMark rm; 2832 OopMapSet *oop_maps = new OopMapSet(); 2833 OopMap* map; 2834 2835 // Allocate space for the code. Setup code generation tools. 2836 CodeBuffer buffer("handler_blob", 2048, 1024); 2837 MacroAssembler* masm = new MacroAssembler(&buffer); 2838 2839 address start = __ pc(); 2840 address call_pc = NULL; 2841 int frame_size_in_words; 2842 bool cause_return = (poll_type == POLL_AT_RETURN); 2843 bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 2844 2845 if (UseRTMLocking) { 2846 // Abort RTM transaction before calling runtime 2847 // because critical section will be large and will be 2848 // aborted anyway. Also nmethod could be deoptimized. 2849 __ xabort(0); 2850 } 2851 2852 // Make room for return address (or push it again) 2853 if (!cause_return) { 2854 __ push(rbx); 2855 } 2856 2857 // Save registers, fpu state, and flags 2858 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors); 2859 2860 // The following is basically a call_VM. However, we need the precise 2861 // address of the call in order to generate an oopmap. Hence, we do all the 2862 // work ourselves. 2863 2864 __ set_last_Java_frame(noreg, noreg, NULL); 2865 2866 // The return address must always be correct so that frame constructor never 2867 // sees an invalid pc. 2868 2869 if (!cause_return) { 2870 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 2871 // Additionally, rbx is a callee saved register and we can look at it later to determine 2872 // if someone changed the return address for us! 2873 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 2874 __ movptr(Address(rbp, wordSize), rbx); 2875 } 2876 2877 // Do the call 2878 __ mov(c_rarg0, r15_thread); 2879 __ call(RuntimeAddress(call_ptr)); 2880 2881 // Set an oopmap for the call site. This oopmap will map all 2882 // oop-registers and debug-info registers as callee-saved. This 2883 // will allow deoptimization at this safepoint to find all possible 2884 // debug-info recordings, as well as let GC find all oops. 
2885 2886 oop_maps->add_gc_map( __ pc() - start, map); 2887 2888 Label noException; 2889 2890 __ reset_last_Java_frame(false); 2891 2892 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 2893 __ jcc(Assembler::equal, noException); 2894 2895 // Exception pending 2896 2897 RegisterSaver::restore_live_registers(masm, save_vectors); 2898 2899 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2900 2901 // No exception case 2902 __ bind(noException); 2903 2904 Label no_adjust; 2905 #ifdef ASSERT 2906 Label bail; 2907 #endif 2908 if (!cause_return) { 2909 Label no_prefix, not_special; 2910 2911 // If our stashed return pc was modified by the runtime we avoid touching it 2912 __ cmpptr(rbx, Address(rbp, wordSize)); 2913 __ jccb(Assembler::notEqual, no_adjust); 2914 2915 // Skip over the poll instruction. 2916 // See NativeInstruction::is_safepoint_poll() 2917 // Possible encodings: 2918 // 85 00 test %eax,(%rax) 2919 // 85 01 test %eax,(%rcx) 2920 // 85 02 test %eax,(%rdx) 2921 // 85 03 test %eax,(%rbx) 2922 // 85 06 test %eax,(%rsi) 2923 // 85 07 test %eax,(%rdi) 2924 // 2925 // 41 85 00 test %eax,(%r8) 2926 // 41 85 01 test %eax,(%r9) 2927 // 41 85 02 test %eax,(%r10) 2928 // 41 85 03 test %eax,(%r11) 2929 // 41 85 06 test %eax,(%r14) 2930 // 41 85 07 test %eax,(%r15) 2931 // 2932 // 85 04 24 test %eax,(%rsp) 2933 // 41 85 04 24 test %eax,(%r12) 2934 // 85 45 00 test %eax,0x0(%rbp) 2935 // 41 85 45 00 test %eax,0x0(%r13) 2936 2937 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 2938 __ jcc(Assembler::notEqual, no_prefix); 2939 __ addptr(rbx, 1); 2940 __ bind(no_prefix); 2941 #ifdef ASSERT 2942 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 2943 #endif 2944 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 2945 // r12/rsp 0x04 2946 // r13/rbp 0x05 2947 __ movzbq(rcx, Address(rbx, 1)); 2948 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 2949 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 2950 __ cmpptr(rcx, 1); 2951 __ jcc(Assembler::above, not_special); 2952 __ addptr(rbx, 1); 2953 __ bind(not_special); 2954 #ifdef ASSERT 2955 // Verify the correct encoding of the poll we're about to skip. 2956 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 2957 __ jcc(Assembler::notEqual, bail); 2958 // Mask out the modrm bits 2959 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 2960 // rax encodes to 0, so if the bits are nonzero it's incorrect 2961 __ jcc(Assembler::notZero, bail); 2962 #endif 2963 // Adjust return pc forward to step over the safepoint poll instruction 2964 __ addptr(rbx, 2); 2965 __ movptr(Address(rbp, wordSize), rbx); 2966 } 2967 2968 __ bind(no_adjust); 2969 // Normal exit, restore registers and exit. 2970 RegisterSaver::restore_live_registers(masm, save_vectors); 2971 __ ret(0); 2972 2973 #ifdef ASSERT 2974 __ bind(bail); 2975 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 2976 #endif 2977 2978 // Make sure all code is generated 2979 masm->flush(); 2980 2981 // Fill-out other meta info 2982 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 2983 } 2984 2985 // 2986 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 2987 // 2988 // Generate a stub that calls into vm to find out the proper destination 2989 // of a java call. 
All the argument registers are live at this point 2990 // but since this is generic code we don't know what they are and the caller 2991 // must do any gc of the args. 2992 // 2993 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) { 2994 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before"); 2995 2996 // allocate space for the code 2997 ResourceMark rm; 2998 2999 CodeBuffer buffer(name, 1200, 512); 3000 MacroAssembler* masm = new MacroAssembler(&buffer); 3001 3002 int frame_size_in_words; 3003 3004 OopMapSet *oop_maps = new OopMapSet(); 3005 OopMap* map = NULL; 3006 3007 int start = __ offset(); 3008 3009 // No need to save vector registers since they are caller-saved anyway. 3010 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false); 3011 3012 int frame_complete = __ offset(); 3013 3014 __ set_last_Java_frame(noreg, noreg, NULL); 3015 3016 __ mov(c_rarg0, r15_thread); 3017 3018 __ call(RuntimeAddress(destination)); 3019 3020 3021 // Set an oopmap for the call site. 3022 // We need this not only for callee-saved registers, but also for volatile 3023 // registers that the compiler might be keeping live across a safepoint. 3024 3025 oop_maps->add_gc_map( __ offset() - start, map); 3026 3027 // rax contains the address we are going to jump to assuming no exception got installed 3028 3029 // clear last_Java_sp 3030 __ reset_last_Java_frame(false); 3031 // check for pending exceptions 3032 Label pending; 3033 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 3034 __ jcc(Assembler::notEqual, pending); 3035 3036 // get the returned Method* 3037 __ get_vm_result_2(rbx, r15_thread); 3038 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx); 3039 3040 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3041 3042 RegisterSaver::restore_live_registers(masm); 3043 3044 // We are back the the original state on entry and ready to go. 3045 3046 __ jmp(rax); 3047 3048 // Pending exception after the safepoint 3049 3050 __ bind(pending); 3051 3052 RegisterSaver::restore_live_registers(masm); 3053 3054 // exception pending => remove activation and forward to exception handler 3055 3056 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD); 3057 3058 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3059 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3060 3061 // ------------- 3062 // make sure all code is generated 3063 masm->flush(); 3064 3065 // return the blob 3066 // frame_size_words or bytes?? 
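  // (Best guess: words. frame_size_in_words comes straight from
  // RegisterSaver::save_live_registers() above and is passed on unchanged, just as
  // generate_handler_blob() hands the same kind of value to SafepointBlob::create().)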
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

#ifdef COMPILER2
static const int native_invoker_code_size = MethodHandles::adapter_code_size;

class NativeInvokerGenerator : public StubCodeGenerator {
  address _call_target;
  int _shadow_space_bytes;

  const GrowableArray<VMReg>& _input_registers;
  const GrowableArray<VMReg>& _output_registers;

  int _frame_complete;
  int _framesize;
  OopMapSet* _oop_maps;
public:
  NativeInvokerGenerator(CodeBuffer* buffer,
                         address call_target,
                         int shadow_space_bytes,
                         const GrowableArray<VMReg>& input_registers,
                         const GrowableArray<VMReg>& output_registers)
   : StubCodeGenerator(buffer, PrintMethodHandleStubs),
     _call_target(call_target),
     _shadow_space_bytes(shadow_space_bytes),
     _input_registers(input_registers),
     _output_registers(output_registers),
     _frame_complete(0),
     _framesize(0),
     _oop_maps(NULL) {
    assert(_output_registers.length() <= 1
           || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
  }

  void generate();

  int spill_size_in_bytes() const {
    if (_output_registers.length() == 0) {
      return 0;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    if (reg->is_Register()) {
      return 8;
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        return 64;
      } else if (UseAVX >= 1) {
        return 32;
      } else {
        return 16;
      }
    } else {
      ShouldNotReachHere();
    }
    return 0;
  }

  void spill_out_registers() {
    if (_output_registers.length() == 0) {
      return;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    MacroAssembler* masm = _masm;
    if (reg->is_Register()) {
      __ movptr(Address(rsp, 0), reg->as_Register());
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
      } else if (UseAVX >= 1) {
        __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
      } else {
        __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
      }
    } else {
      ShouldNotReachHere();
    }
  }

  void fill_out_registers() {
    if (_output_registers.length() == 0) {
      return;
    }
    VMReg reg = _output_registers.at(0);
    assert(reg->is_reg(), "must be a register");
    MacroAssembler* masm = _masm;
    if (reg->is_Register()) {
      __ movptr(reg->as_Register(), Address(rsp, 0));
    } else if (reg->is_XMMRegister()) {
      if (UseAVX >= 3) {
        __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
      } else if (UseAVX >= 1) {
        __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
      } else {
        __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
      }
    } else {
      ShouldNotReachHere();
    }
  }

  int frame_complete() const {
    return _frame_complete;
  }

  int framesize() const {
    return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
  }

  OopMapSet* oop_maps() const {
    return _oop_maps;
  }

private:
#ifdef ASSERT
  bool target_uses_register(VMReg reg) {
    return _input_registers.contains(reg) || _output_registers.contains(reg);
  }
#endif
};

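// NativeInvokerGenerator emits the "nep_invoker_blob" used for foreign
// downcalls: the stub records a last_Java_frame, transitions the thread to
// _thread_in_native, calls the native target, and on the way back performs
// the return safepoint poll and the stack-guard reguard check before
// transitioning back to _thread_in_Java. A single output register, if
// present, is spilled around the slow-path runtime calls (see
// spill_out_registers/fill_out_registers above).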
RuntimeStub*
SharedRuntime::make_native_invoker(address call_target,
                                   int shadow_space_bytes,
                                   const GrowableArray<VMReg>& input_registers,
                                   const GrowableArray<VMReg>& output_registers) {
  int locs_size = 64;
  CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
  NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
  g.generate();
  code.log_section_sizes("nep_invoker_blob");

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub("nep_invoker_blob",
                                  &code,
                                  g.frame_complete(),
                                  g.framesize(),
                                  g.oop_maps(), false);
  return stub;
}

void NativeInvokerGenerator::generate() {
  assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");

  enum layout {
    rbp_off,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
  assert(is_even(_framesize/2), "sp not 16-byte aligned");

  _oop_maps = new OopMapSet();
  MacroAssembler* masm = _masm;

  address start = __ pc();

  __ enter();

  // return address and rbp are already in place
  __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog

  _frame_complete = __ pc() - start;

  address the_pc = __ pc();

  __ set_last_Java_frame(rsp, rbp, (address)the_pc);
  OopMap* map = new OopMap(_framesize, 0);
  _oop_maps->add_gc_map(the_pc - start, map);

  // State transition
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(_call_target));

  __ restore_cpu_control_state_after_jni();

  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  __ membar(Assembler::Membar_mask_bits(
              Assembler::LoadLoad | Assembler::LoadStore |
              Assembler::StoreLoad | Assembler::StoreStore));

  Label L_after_safepoint_poll;
  Label L_safepoint_poll_slow_path;

  __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
  __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
  __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);

  __ bind(L_after_safepoint_poll);

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);

  __ block_comment("reguard stack check");
  Label L_reguard;
  Label L_after_reguard;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, L_reguard);
  __ bind(L_after_reguard);

  __ reset_last_Java_frame(r15_thread, true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  //////////////////////////////////////////////////////////////////////////////
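  // The two slow paths below share one shape: spill the output register,
  // save rsp in r12, carve out the Windows argument shadow area and realign
  // the stack, call into the VM, then restore rsp, reinitialize the
  // compressed-heap base in r12 and refill the output register before
  // rejoining the fast path.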

  __ block_comment("{ L_safepoint_poll_slow_path");
  __ bind(L_safepoint_poll_slow_path);
  __ vzeroupper();

  spill_out_registers();

  __ mov(c_rarg0, r15_thread);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();

  fill_out_registers();

  __ jmp(L_after_safepoint_poll);
  __ block_comment("} L_safepoint_poll_slow_path");

  //////////////////////////////////////////////////////////////////////////////

  __ block_comment("{ L_reguard");
  __ bind(L_reguard);
  __ vzeroupper();

  spill_out_registers();

  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();

  fill_out_registers();

  __ jmp(L_after_reguard);

  __ block_comment("} L_reguard");

  //////////////////////////////////////////////////////////////////////////////

  __ flush();
}
#endif // COMPILER2

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
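
// Both platform variants implement the same contract: MACC performs
// (T2:T1:T0) += A * B, adding the 128-bit product into T0/T1 and propagating
// the final carry into T2, i.e. a triple-precision accumulator held in three
// 64-bit words. MACC2 adds the product twice, which is what the squaring code
// below needs for the doubled cross terms 2*a[i]*a[j].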

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}
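
// Worked example (comment only, len == 1): take n[0] = 23 and inv chosen so
// that inv * n[0] == (julong)-1, i.e. inv = -n[0]^-1 mod 2^64.  For a[0] = 5
// and b[0] = 9 the first loop accumulates t = 45, picks m[0] = (45 * inv) mod
// 2^64 so that t + m[0]*n[0] has a zero low word, and the second loop emits
// the high word, which equals 5 * 9 * 2^-64 mod 23 up to one final
// conditional subtraction of n.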

// Copy len longwords from s to d, word-swapping as we go.  The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and will
  // use a total of 8K bytes of stack space here.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and will
  // use a total of 6K bytes of stack space here.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
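
// Sizing note for the guarantees above: with divisor = sizeof(julong) * 4
// (resp. * 3), longwords is capped at 8192/32 = 256 (resp. 8192/24 = 341),
// so the alloca'd scratch area never exceeds 8K bytes in either entry point.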

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob; compiled code jumps to it when an exception
// is being thrown (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call, we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame where the exception happened in.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);
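
  // handle_exception_C returned the continuation address in rax (possibly
  // the deopt blob's exception entry); the exception oop and pc themselves
  // are handed over through the JavaThread fields rather than in registers.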

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
                                       int total_in_args, const VMRegPair* in_regs,
                                       int total_out_args, VMRegPair* out_regs,
                                       GrowableArray<int>& arg_order,
                                       VMRegPair tmp_vmreg) {
  ComputeMoveOrder order(total_in_args, in_regs,
                         total_out_args, out_regs,
                         in_sig_bt, arg_order, tmp_vmreg);
}
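
// Note: ComputeMoveOrder does all of its work in the constructor; it is
// expected to populate 'arg_order' with a sequence of (input, output) index
// pairs giving a safe order for the argument shuffle, using 'tmp_vmreg' to
// break register-move cycles, so no further calls on 'order' are needed here.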