1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

// Saves and restores the full CPU register state (GPRs, flags, XMM/YMM/ZMM and
// opmask registers) around runtime calls that may trigger GC or deoptimization,
// and builds the matching OopMap describing where each register was saved.
class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
  // The XSAVE_AREA_* constants are byte offsets of the respective register
  // groups inside the processor-defined XSAVE save area laid down by
  // push_CPU_state() at the bottom of the frame.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

// Emit code that pushes a full register-save frame (enter + push_CPU_state,
// plus manual spills of the AVX/EVEX state that push_CPU_state does not cover)
// and return an OopMap mapping every saved register to its stack slot.
// On exit *total_frame_words holds the frame size in words.
// NOTE(review): additional_frame_words is not consumed anywhere in this body —
// presumably callers pass 0; confirm against call sites.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  // NOTE(review): oop_maps is allocated but never used in this function;
  // only the single map below is returned.
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved by is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

// Emit code that undoes save_live_registers: reload the EVEX/AVX state that was
// spilled manually, then pop_CPU_state and the saved rbp.  Must mirror the save
// sequence exactly.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.
// All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.
// Register up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        // Stack arguments are kept 8-byte (2-slot) aligned.
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}

// Patch the callers callsite with entry to compiled code if it exists.
// Expects the target Method* in rbx; preserves all CPU state around the
// runtime call to SharedRuntime::fixup_callers_callsite.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // Nothing to do if the method has no compiled code yet.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


// Generate the compiled-to-interpreted adapter: spill the compiled-convention
// arguments (described by regs/sig_bt) into the interpreter's expected
// stack-slot layout, then jump to the method's interpreter entry (via rbx).
static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However to make thing extra confusing. Because we can fit a long/double in
    // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less ) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

// Emit a range check on pc_reg against [code_start, code_end): jumps to L_ok
// when the pc lies inside the range, falls through otherwise.  Clobbers
// temp_reg.
static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                               next_off : ld_off;

        // This can be a misaligned move: the interpreter slot holding the
        // second half need not be 8-byte aligned.
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM register argument: a single slot is a float, a double is read
      // from the (possibly misaligned) slot pair at next_off.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // Put the Method* where a c2i adapter would expect it, should we end up
  // there: c2 resolve stubs return the Method* as a result in rax.
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
// Generate the i2c and c2i adapters for the given signature, plus the
// unverified (inline-cache checking) and no-clinit-check c2i entry points,
// and register the bundle with the AdapterHandlerLibrary.
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

// Compute the native (C) calling convention for the given signature.
// Fills regs[] with the register or stack location of each argument and
// returns the number of VMRegImpl stack slots needed for all the arguments,
// NOT counting out_preserve_stack_slots.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        // On Windows an integer register argument also consumes a float
        // register slot, and vice versa.
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

// Compute the calling convention for vector arguments: assign each argument
// the next XMM/YMM/ZMM register, as a VMReg pair spanning num_bits.
// NOTE(review): assumes total_args_passed fits in the 32 available vector
// registers — no spill-to-stack path exists here; confirm callers guarantee it.
int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    // Number of additional stack-slot-sized pieces beyond the first:
    // 64 bits -> 1, 128 -> 3, 256 -> 7, 512 -> 15.
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  // No arguments are ever placed on the stack here.
  return stk_args;
}

// Spill the native (JNI) call result to the word just below the frame
// pointer so a VM call can be made without clobbering it.
void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    // All integral/pointer results live in rax.
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

// Reload the native call result saved by save_native_result() above.
void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below frame pointer
  // which by this time is free to use
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

// Save live register arguments [first_arg, arg_count) on the stack so a
// runtime call can be made; paired with restore_args() below.
static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      // XMM registers are saved as a full double in two stack words.
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

// Restore register arguments saved by save_args(); iterates in reverse
// order to match the push sequence.
static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Under +VerifyOops, emit verification code for every reference-typed
// argument of the given method, whether passed in a register or on stack.
static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

// Assert (at code-generation time) that the calling convention placed the
// named Continuation.enterSpecial argument in the register we hard-code.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Allocates a blank ContinuationEntry on the stack, links it into the
// thread's continuation-entry chain, and returns an OopMap sized for the
// resulting frame.
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new entry: entry->parent = thread->cont_entry; thread->cont_entry = entry.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Populates the blank ContinuationEntry created by continuation_enter_setup(),
// saving the thread's fastpath/monitor-count state into it and resetting the
// thread-local copies.
//
// Arguments:
//   rsp: pointer to blank Continuation entry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug cookie, checked elsewhere to catch stray/garbled entries.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the thread's current fastpath and held-monitor count into the entry.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  // Reset the thread-local values for the new continuation.
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Unlinks the top ContinuationEntry, restoring the parent's fastpath and
// monitor-count state into the thread, and pops the entry off the stack.
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
void static continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);

  if (CheckJNICalls) {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // If the held monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
    __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // rax may hold an exception oop, save it before the call
    __ push(rax);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
    __ pop(rax);

    // For vthreads we have to explicitly zero the JNI monitor count of the carrier
    // on termination. The held count is implicitly zeroed below when we restore from
    // the parent held count (which has to be zero).
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#ifdef ASSERT
  else {
    // Check if this is a virtual thread continuation
    Label L_skip_vthread_code;
    __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
    __ jcc(Assembler::equal, L_skip_vthread_code);

    // See comment just above. If not checking JNI calls the JNI count is only
    // needed for assertion checking.
    __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);

    __ bind(L_skip_vthread_code);
  }
#endif

  // Restore the parent's held-monitor count.
  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  // Unlink: thread->cont_entry = entry->parent; then pop the entry.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

// Generates the intrinsic implementation of Continuation.enterSpecial:
// builds a ContinuationEntry frame, then either thaws an existing
// continuation or resolves and calls Continuation.enter. Emits both an
// interpreted (interp_only_mode) entry and a compiled entry, and records
// the various offsets/oopmaps the caller needs for the nmethod.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter).
    // Interpreter slots grow downward, so argument 0 is at the highest offset.
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved.  There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);

  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}

// Generates the intrinsic implementation of Continuation.yield (doYield):
// sets up a minimal frame, calls the freeze entry, and either returns into
// the continuation entry (freeze succeeded) or back to the caller (pinned),
// forwarding any pending exception thrown by freeze.
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize /  VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // A non-zero result from freeze means the continuation is pinned.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: unwind to the continuation entry and return from there.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}

// Generates the compiled entry for a method-handle signature-polymorphic
// intrinsic: loads the receiver and/or trailing MemberName/NativeEntryPoint
// argument into registers, then tail-dispatches through
// MethodHandles::generate_method_handle_dispatch.
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method.  The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to java state (possibly blocking), unhandlizes any result and
// returns.
//
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1718 // 1719 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1720 const methodHandle& method, 1721 int compile_id, 1722 BasicType* in_sig_bt, 1723 VMRegPair* in_regs, 1724 BasicType ret_type) { 1725 if (method->is_continuation_native_intrinsic()) { 1726 int exception_offset = -1; 1727 OopMapSet* oop_maps = new OopMapSet(); 1728 int frame_complete = -1; 1729 int stack_slots = -1; 1730 int interpreted_entry_offset = -1; 1731 int vep_offset = -1; 1732 if (method->is_continuation_enter_intrinsic()) { 1733 gen_continuation_enter(masm, 1734 in_regs, 1735 exception_offset, 1736 oop_maps, 1737 frame_complete, 1738 stack_slots, 1739 interpreted_entry_offset, 1740 vep_offset); 1741 } else if (method->is_continuation_yield_intrinsic()) { 1742 gen_continuation_yield(masm, 1743 in_regs, 1744 oop_maps, 1745 frame_complete, 1746 stack_slots, 1747 vep_offset); 1748 } else { 1749 guarantee(false, "Unknown Continuation native intrinsic"); 1750 } 1751 1752 #ifdef ASSERT 1753 if (method->is_continuation_enter_intrinsic()) { 1754 assert(interpreted_entry_offset != -1, "Must be set"); 1755 assert(exception_offset != -1, "Must be set"); 1756 } else { 1757 assert(interpreted_entry_offset == -1, "Must be unset"); 1758 assert(exception_offset == -1, "Must be unset"); 1759 } 1760 assert(frame_complete != -1, "Must be set"); 1761 assert(stack_slots != -1, "Must be set"); 1762 assert(vep_offset != -1, "Must be set"); 1763 #endif 1764 1765 __ flush(); 1766 nmethod* nm = nmethod::new_native_nmethod(method, 1767 compile_id, 1768 masm->code(), 1769 vep_offset, 1770 frame_complete, 1771 stack_slots, 1772 in_ByteSize(-1), 1773 in_ByteSize(-1), 1774 oop_maps, 1775 exception_offset); 1776 if (nm == nullptr) return nm; 1777 if (method->is_continuation_enter_intrinsic()) { 1778 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1779 } else if (method->is_continuation_yield_intrinsic()) { 1780 _cont_doYield_stub = nm; 1781 } 1782 return nm; 1783 } 1784 1785 if 
(method->is_method_handle_intrinsic()) { 1786 vmIntrinsics::ID iid = method->intrinsic_id(); 1787 intptr_t start = (intptr_t)__ pc(); 1788 int vep_offset = ((intptr_t)__ pc()) - start; 1789 gen_special_dispatch(masm, 1790 method, 1791 in_sig_bt, 1792 in_regs); 1793 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1794 __ flush(); 1795 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1796 return nmethod::new_native_nmethod(method, 1797 compile_id, 1798 masm->code(), 1799 vep_offset, 1800 frame_complete, 1801 stack_slots / VMRegImpl::slots_per_word, 1802 in_ByteSize(-1), 1803 in_ByteSize(-1), 1804 nullptr); 1805 } 1806 address native_func = method->native_function(); 1807 assert(native_func != nullptr, "must have function"); 1808 1809 // An OopMap for lock (and class if static) 1810 OopMapSet *oop_maps = new OopMapSet(); 1811 intptr_t start = (intptr_t)__ pc(); 1812 1813 // We have received a description of where all the java arg are located 1814 // on entry to the wrapper. We need to convert these args to where 1815 // the jni function will expect them. To figure out where they go 1816 // we convert the java signature to a C signature by inserting 1817 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1818 1819 const int total_in_args = method->size_of_parameters(); 1820 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1821 1822 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1823 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1824 BasicType* in_elem_bt = nullptr; 1825 1826 int argc = 0; 1827 out_sig_bt[argc++] = T_ADDRESS; 1828 if (method->is_static()) { 1829 out_sig_bt[argc++] = T_OBJECT; 1830 } 1831 1832 for (int i = 0; i < total_in_args ; i++ ) { 1833 out_sig_bt[argc++] = in_sig_bt[i]; 1834 } 1835 1836 // Now figure out where the args must be stored and how much stack space 1837 // they require. 
1838 int out_arg_slots; 1839 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1840 1841 // Compute framesize for the wrapper. We need to handlize all oops in 1842 // incoming registers 1843 1844 // Calculate the total number of stack slots we will need. 1845 1846 // First count the abi requirement plus all of the outgoing args 1847 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1848 1849 // Now the space for the inbound oop handle area 1850 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1851 1852 int oop_handle_offset = stack_slots; 1853 stack_slots += total_save_slots; 1854 1855 // Now any space we need for handlizing a klass if static method 1856 1857 int klass_slot_offset = 0; 1858 int klass_offset = -1; 1859 int lock_slot_offset = 0; 1860 bool is_static = false; 1861 1862 if (method->is_static()) { 1863 klass_slot_offset = stack_slots; 1864 stack_slots += VMRegImpl::slots_per_word; 1865 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1866 is_static = true; 1867 } 1868 1869 // Plus a lock if needed 1870 1871 if (method->is_synchronized()) { 1872 lock_slot_offset = stack_slots; 1873 stack_slots += VMRegImpl::slots_per_word; 1874 } 1875 1876 // Now a place (+2) to save return values or temp during shuffling 1877 // + 4 for return address (which we own) and saved rbp 1878 stack_slots += 6; 1879 1880 // Ok The space we have allocated will look like: 1881 // 1882 // 1883 // FP-> | | 1884 // |---------------------| 1885 // | 2 slots for moves | 1886 // |---------------------| 1887 // | lock box (if sync) | 1888 // |---------------------| <- lock_slot_offset 1889 // | klass (if static) | 1890 // |---------------------| <- klass_slot_offset 1891 // | oopHandle area | 1892 // |---------------------| <- oop_handle_offset (6 java arg registers) 1893 // | outbound memory | 1894 // | based arguments | 1895 // | | 1896 // |---------------------| 1897 // | | 1898 // 
SP-> | out_preserved_slots | 1899 // 1900 // 1901 1902 1903 // Now compute actual number of stack words we need rounding to make 1904 // stack properly aligned. 1905 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1906 1907 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1908 1909 // First thing make an ic check to see if we should even be here 1910 1911 // We are free to use all registers as temps without saving them and 1912 // restoring them except rbp. rbp is the only callee save register 1913 // as far as the interpreter and the compiler(s) are concerned. 1914 1915 const Register receiver = j_rarg0; 1916 1917 Label exception_pending; 1918 1919 assert_different_registers(receiver, rscratch1, rscratch2); 1920 __ verify_oop(receiver); 1921 __ ic_check(8 /* end_alignment */); 1922 1923 int vep_offset = ((intptr_t)__ pc()) - start; 1924 1925 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1926 Label L_skip_barrier; 1927 Register klass = r10; 1928 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1929 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1930 1931 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1932 1933 __ bind(L_skip_barrier); 1934 } 1935 1936 #ifdef COMPILER1 1937 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 1938 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1939 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1940 } 1941 #endif // COMPILER1 1942 1943 // The instruction at the verified entry point must be 5 bytes or longer 1944 // because it can be patched on the fly by make_non_entrant. The stack bang 1945 // instruction fits that requirement. 
1946 1947 // Generate stack overflow check 1948 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1949 1950 // Generate a new frame for the wrapper. 1951 __ enter(); 1952 // -2 because return address is already present and so is saved rbp 1953 __ subptr(rsp, stack_size - 2*wordSize); 1954 1955 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1956 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 1957 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 1958 1959 // Frame is now completed as far as size and linkage. 1960 int frame_complete = ((intptr_t)__ pc()) - start; 1961 1962 if (UseRTMLocking) { 1963 // Abort RTM transaction before calling JNI 1964 // because critical section will be large and will be 1965 // aborted anyway. Also nmethod could be deoptimized. 1966 __ xabort(0); 1967 } 1968 1969 #ifdef ASSERT 1970 __ check_stack_alignment(rsp, "improperly aligned stack"); 1971 #endif /* ASSERT */ 1972 1973 1974 // We use r14 as the oop handle for the receiver/klass 1975 // It is callee save so it survives the call to native 1976 1977 const Register oop_handle_reg = r14; 1978 1979 // 1980 // We immediately shuffle the arguments so that any vm call we have to 1981 // make from here on out (sync slow path, jvmti, etc.) we will have 1982 // captured the oops from our caller and have a valid oopMap for 1983 // them. 1984 1985 // ----------------- 1986 // The Grand Shuffle 1987 1988 // The Java calling convention is either equal (linux) or denser (win64) than the 1989 // c calling convention. However the because of the jni_env argument the c calling 1990 // convention always has at least one more (and two for static) arguments than Java. 
1991 // Therefore if we move the args from java -> c backwards then we will never have 1992 // a register->register conflict and we don't have to build a dependency graph 1993 // and figure out how to break any cycles. 1994 // 1995 1996 // Record esp-based slot for receiver on stack for non-static methods 1997 int receiver_offset = -1; 1998 1999 // This is a trick. We double the stack slots so we can claim 2000 // the oops in the caller's frame. Since we are sure to have 2001 // more args than the caller doubling is enough to make 2002 // sure we can capture all the incoming oop args from the 2003 // caller. 2004 // 2005 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2006 2007 // Mark location of rbp (someday) 2008 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2009 2010 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2011 // All inbound args are referenced based on rbp and all outbound args via rsp. 2012 2013 2014 #ifdef ASSERT 2015 bool reg_destroyed[Register::number_of_registers]; 2016 bool freg_destroyed[XMMRegister::number_of_registers]; 2017 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2018 reg_destroyed[r] = false; 2019 } 2020 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2021 freg_destroyed[f] = false; 2022 } 2023 2024 #endif /* ASSERT */ 2025 2026 // For JNI natives the incoming and outgoing registers are offset upwards. 
2027 GrowableArray<int> arg_order(2 * total_in_args); 2028 2029 VMRegPair tmp_vmreg; 2030 tmp_vmreg.set2(rbx->as_VMReg()); 2031 2032 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2033 arg_order.push(i); 2034 arg_order.push(c_arg); 2035 } 2036 2037 int temploc = -1; 2038 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2039 int i = arg_order.at(ai); 2040 int c_arg = arg_order.at(ai + 1); 2041 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2042 #ifdef ASSERT 2043 if (in_regs[i].first()->is_Register()) { 2044 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2045 } else if (in_regs[i].first()->is_XMMRegister()) { 2046 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2047 } 2048 if (out_regs[c_arg].first()->is_Register()) { 2049 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2050 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2051 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2052 } 2053 #endif /* ASSERT */ 2054 switch (in_sig_bt[i]) { 2055 case T_ARRAY: 2056 case T_OBJECT: 2057 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2058 ((i == 0) && (!is_static)), 2059 &receiver_offset); 2060 break; 2061 case T_VOID: 2062 break; 2063 2064 case T_FLOAT: 2065 __ float_move(in_regs[i], out_regs[c_arg]); 2066 break; 2067 2068 case T_DOUBLE: 2069 assert( i + 1 < total_in_args && 2070 in_sig_bt[i + 1] == T_VOID && 2071 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2072 __ double_move(in_regs[i], out_regs[c_arg]); 2073 break; 2074 2075 case T_LONG : 2076 __ long_move(in_regs[i], out_regs[c_arg]); 2077 break; 2078 2079 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2080 2081 default: 2082 __ move32_64(in_regs[i], out_regs[c_arg]); 2083 } 2084 } 2085 2086 int c_arg; 2087 2088 // Pre-load a static method's oop into r14. 
Used both by locking code and 2089 // the normal JNI call code. 2090 // point c_arg at the first arg that is already loaded in case we 2091 // need to spill before we call out 2092 c_arg = total_c_args - total_in_args; 2093 2094 if (method->is_static()) { 2095 2096 // load oop into a register 2097 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2098 2099 // Now handlize the static class mirror it's known not-null. 2100 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2101 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2102 2103 // Now get the handle 2104 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2105 // store the klass handle as second argument 2106 __ movptr(c_rarg1, oop_handle_reg); 2107 // and protect the arg if we must spill 2108 c_arg--; 2109 } 2110 2111 // Change state to native (we save the return address in the thread, since it might not 2112 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2113 // points into the right code segment. It does not have to be the correct return pc. 2114 // We use the same pc/oopMap repeatedly when we call out 2115 2116 intptr_t the_pc = (intptr_t) __ pc(); 2117 oop_maps->add_gc_map(the_pc - start, map); 2118 2119 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1); 2120 2121 2122 // We have all of the arguments setup at this point. We must not touch any register 2123 // argument registers at this point (what if we save/restore them there are no oop? 
2124 2125 { 2126 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2127 // protect the args we've loaded 2128 save_args(masm, total_c_args, c_arg, out_regs); 2129 __ mov_metadata(c_rarg1, method()); 2130 __ call_VM_leaf( 2131 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2132 r15_thread, c_rarg1); 2133 restore_args(masm, total_c_args, c_arg, out_regs); 2134 } 2135 2136 // RedefineClasses() tracing support for obsolete method entry 2137 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2138 // protect the args we've loaded 2139 save_args(masm, total_c_args, c_arg, out_regs); 2140 __ mov_metadata(c_rarg1, method()); 2141 __ call_VM_leaf( 2142 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2143 r15_thread, c_rarg1); 2144 restore_args(masm, total_c_args, c_arg, out_regs); 2145 } 2146 2147 // Lock a synchronized method 2148 2149 // Register definitions used by locking and unlocking 2150 2151 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2152 const Register obj_reg = rbx; // Will contain the oop 2153 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2154 const Register old_hdr = r13; // value of old header at unlock time 2155 2156 Label slow_path_lock; 2157 Label lock_done; 2158 2159 if (method->is_synchronized()) { 2160 Label count_mon; 2161 2162 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2163 2164 // Get the handle (the 2nd argument) 2165 __ mov(oop_handle_reg, c_rarg1); 2166 2167 // Get address of the box 2168 2169 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2170 2171 // Load the oop from the handle 2172 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2173 2174 if (LockingMode == LM_MONITOR) { 2175 __ jmp(slow_path_lock); 2176 } else if (LockingMode == LM_LEGACY) { 2177 // Load immediate 1 into swap_reg %rax 2178 __ movl(swap_reg, 1); 2179 2180 // Load (object->mark() | 1) into swap_reg %rax 2181 
__ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2182 2183 // Save (object->mark() | 1) into BasicLock's displaced header 2184 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2185 2186 // src -> dest iff dest == rax else rax <- dest 2187 __ lock(); 2188 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2189 __ jcc(Assembler::equal, count_mon); 2190 2191 // Hmm should this move to the slow path code area??? 2192 2193 // Test if the oopMark is an obvious stack pointer, i.e., 2194 // 1) (mark & 3) == 0, and 2195 // 2) rsp <= mark < mark + os::pagesize() 2196 // These 3 tests can be done by evaluating the following 2197 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2198 // assuming both stack pointer and pagesize have their 2199 // least significant 2 bits clear. 2200 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2201 2202 __ subptr(swap_reg, rsp); 2203 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2204 2205 // Save the test result, for recursive case, the result is zero 2206 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2207 __ jcc(Assembler::notEqual, slow_path_lock); 2208 } else { 2209 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2210 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2211 } 2212 __ bind(count_mon); 2213 __ inc_held_monitor_count(); 2214 2215 // Slow path will re-enter here 2216 __ bind(lock_done); 2217 } 2218 2219 // Finally just about ready to make the JNI call 2220 2221 // get JNIEnv* which is first argument to native 2222 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2223 2224 // Now set thread in native 2225 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2226 2227 __ call(RuntimeAddress(native_func)); 2228 2229 // Verify or restore cpu control state after JNI call 2230 __ restore_cpu_control_state_after_jni(rscratch1); 2231 2232 // 
Unpack native results. 2233 switch (ret_type) { 2234 case T_BOOLEAN: __ c2bool(rax); break; 2235 case T_CHAR : __ movzwl(rax, rax); break; 2236 case T_BYTE : __ sign_extend_byte (rax); break; 2237 case T_SHORT : __ sign_extend_short(rax); break; 2238 case T_INT : /* nothing to do */ break; 2239 case T_DOUBLE : 2240 case T_FLOAT : 2241 // Result is in xmm0 we'll save as needed 2242 break; 2243 case T_ARRAY: // Really a handle 2244 case T_OBJECT: // Really a handle 2245 break; // can't de-handlize until after safepoint check 2246 case T_VOID: break; 2247 case T_LONG: break; 2248 default : ShouldNotReachHere(); 2249 } 2250 2251 Label after_transition; 2252 2253 // Switch thread to "native transition" state before reading the synchronization state. 2254 // This additional state is necessary because reading and testing the synchronization 2255 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2256 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2257 // VM thread changes sync state to synchronizing and suspends threads for GC. 2258 // Thread A is resumed to finish this native method, but doesn't block here since it 2259 // didn't see any synchronization is progress, and escapes. 
2260 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2261 2262 // Force this write out before the read below 2263 if (!UseSystemMemoryBarrier) { 2264 __ membar(Assembler::Membar_mask_bits( 2265 Assembler::LoadLoad | Assembler::LoadStore | 2266 Assembler::StoreLoad | Assembler::StoreStore)); 2267 } 2268 2269 // check for safepoint operation in progress and/or pending suspend requests 2270 { 2271 Label Continue; 2272 Label slow_path; 2273 2274 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2275 2276 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2277 __ jcc(Assembler::equal, Continue); 2278 __ bind(slow_path); 2279 2280 // Don't use call_VM as it will see a possible pending exception and forward it 2281 // and never return here preventing us from clearing _last_native_pc down below. 2282 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2283 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2284 // by hand. 
2285 // 2286 __ vzeroupper(); 2287 save_native_result(masm, ret_type, stack_slots); 2288 __ mov(c_rarg0, r15_thread); 2289 __ mov(r12, rsp); // remember sp 2290 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2291 __ andptr(rsp, -16); // align stack as required by ABI 2292 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2293 __ mov(rsp, r12); // restore sp 2294 __ reinit_heapbase(); 2295 // Restore any method result value 2296 restore_native_result(masm, ret_type, stack_slots); 2297 __ bind(Continue); 2298 } 2299 2300 // change thread state 2301 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2302 __ bind(after_transition); 2303 2304 Label reguard; 2305 Label reguard_done; 2306 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2307 __ jcc(Assembler::equal, reguard); 2308 __ bind(reguard_done); 2309 2310 // native result if any is live 2311 2312 // Unlock 2313 Label slow_path_unlock; 2314 Label unlock_done; 2315 if (method->is_synchronized()) { 2316 2317 Label fast_done; 2318 2319 // Get locked oop from the handle we passed to jni 2320 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2321 2322 if (LockingMode == LM_LEGACY) { 2323 Label not_recur; 2324 // Simple recursive lock? 
2325 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2326 __ jcc(Assembler::notEqual, not_recur); 2327 __ dec_held_monitor_count(); 2328 __ jmpb(fast_done); 2329 __ bind(not_recur); 2330 } 2331 2332 // Must save rax if it is live now because cmpxchg must use it 2333 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2334 save_native_result(masm, ret_type, stack_slots); 2335 } 2336 2337 if (LockingMode == LM_MONITOR) { 2338 __ jmp(slow_path_unlock); 2339 } else if (LockingMode == LM_LEGACY) { 2340 // get address of the stack lock 2341 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2342 // get old displaced header 2343 __ movptr(old_hdr, Address(rax, 0)); 2344 2345 // Atomic swap old header if oop still contains the stack lock 2346 __ lock(); 2347 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2348 __ jcc(Assembler::notEqual, slow_path_unlock); 2349 __ dec_held_monitor_count(); 2350 } else { 2351 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2352 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2353 __ dec_held_monitor_count(); 2354 } 2355 2356 // slow path re-enters here 2357 __ bind(unlock_done); 2358 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2359 restore_native_result(masm, ret_type, stack_slots); 2360 } 2361 2362 __ bind(fast_done); 2363 } 2364 { 2365 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2366 save_native_result(masm, ret_type, stack_slots); 2367 __ mov_metadata(c_rarg1, method()); 2368 __ call_VM_leaf( 2369 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2370 r15_thread, c_rarg1); 2371 restore_native_result(masm, ret_type, stack_slots); 2372 } 2373 2374 __ reset_last_Java_frame(false); 2375 2376 // Unbox oop result, e.g. JNIHandles::resolve value. 
2377 if (is_reference_type(ret_type)) { 2378 __ resolve_jobject(rax /* value */, 2379 r15_thread /* thread */, 2380 rcx /* tmp */); 2381 } 2382 2383 if (CheckJNICalls) { 2384 // clear_pending_jni_exception_check 2385 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2386 } 2387 2388 // reset handle block 2389 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2390 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2391 2392 // pop our frame 2393 2394 __ leave(); 2395 2396 // Any exception pending? 2397 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2398 __ jcc(Assembler::notEqual, exception_pending); 2399 2400 // Return 2401 2402 __ ret(0); 2403 2404 // Unexpected paths are out of line and go here 2405 2406 // forward the exception 2407 __ bind(exception_pending); 2408 2409 // and forward the exception 2410 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2411 2412 // Slow path locking & unlocking 2413 if (method->is_synchronized()) { 2414 2415 // BEGIN Slow path lock 2416 __ bind(slow_path_lock); 2417 2418 // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM 2419 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2420 2421 // protect the args we've loaded 2422 save_args(masm, total_c_args, c_arg, out_regs); 2423 2424 __ mov(c_rarg0, obj_reg); 2425 __ mov(c_rarg1, lock_reg); 2426 __ mov(c_rarg2, r15_thread); 2427 2428 // Not a leaf but we have last_Java_frame setup as we want 2429 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2430 restore_args(masm, total_c_args, c_arg, out_regs); 2431 2432 #ifdef ASSERT 2433 { Label L; 2434 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2435 __ jcc(Assembler::equal, L); 2436 __ stop("no pending exception allowed on exit from monitorenter"); 2437 __ bind(L); 2438 } 2439 #endif 2440 __ jmp(lock_done); 2441 2442 // END Slow path lock 2443 2444 // BEGIN Slow path unlock 2445 __ bind(slow_path_unlock); 2446 2447 // If we haven't already saved the native result we must save it now as xmm registers 2448 // are still exposed. 
2449 __ vzeroupper(); 2450 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2451 save_native_result(masm, ret_type, stack_slots); 2452 } 2453 2454 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2455 2456 __ mov(c_rarg0, obj_reg); 2457 __ mov(c_rarg2, r15_thread); 2458 __ mov(r12, rsp); // remember sp 2459 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2460 __ andptr(rsp, -16); // align stack as required by ABI 2461 2462 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2463 // NOTE that obj_reg == rbx currently 2464 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2465 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2466 2467 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2468 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2469 __ mov(rsp, r12); // restore sp 2470 __ reinit_heapbase(); 2471 #ifdef ASSERT 2472 { 2473 Label L; 2474 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2475 __ jcc(Assembler::equal, L); 2476 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2477 __ bind(L); 2478 } 2479 #endif /* ASSERT */ 2480 2481 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2482 2483 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2484 restore_native_result(masm, ret_type, stack_slots); 2485 } 2486 __ jmp(unlock_done); 2487 2488 // END Slow path unlock 2489 2490 } // synchronized 2491 2492 // SLOW PATH Reguard the stack if needed 2493 2494 __ bind(reguard); 2495 __ vzeroupper(); 2496 save_native_result(masm, ret_type, stack_slots); 2497 __ mov(r12, rsp); // remember sp 2498 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2499 __ andptr(rsp, -16); // align stack as required by ABI 2500 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, 
SharedRuntime::reguard_yellow_pages))); 2501 __ mov(rsp, r12); // restore sp 2502 __ reinit_heapbase(); 2503 restore_native_result(masm, ret_type, stack_slots); 2504 // and continue 2505 __ jmp(reguard_done); 2506 2507 2508 2509 __ flush(); 2510 2511 nmethod *nm = nmethod::new_native_nmethod(method, 2512 compile_id, 2513 masm->code(), 2514 vep_offset, 2515 frame_complete, 2516 stack_slots / VMRegImpl::slots_per_word, 2517 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2518 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2519 oop_maps); 2520 2521 return nm; 2522 } 2523 2524 // this function returns the adjust size (in number of words) to a c2i adapter 2525 // activation for use during deoptimization 2526 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2527 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2528 } 2529 2530 2531 uint SharedRuntime::out_preserve_stack_slots() { 2532 return 0; 2533 } 2534 2535 2536 // Number of stack slots between incoming argument block and the start of 2537 // a new frame. The PROLOG must add this many slots to the stack. The 2538 // EPILOG must remove this many slots. amd64 needs two slots for 2539 // return address. 
// Number of stack slots that must be preserved across a call:
// 4 always, plus 2 more when VerifyStackAtCalls is enabled (room for
// the stack-verification canary words).
uint SharedRuntime::in_preserve_stack_slots() {
  return 4 + 2 * VerifyStackAtCalls;
}

//------------------------------generate_deopt_blob----------------------------
// Builds the blob control transfers to when a compiled frame must be
// deoptimized.  The blob exposes several entry points whose offsets are
// recorded in _deopt_blob at the bottom of this function:
//   - default entry:           normal deopt, return address already on stack
//   - exception_offset:        deopt with pending exception (oop in rax, pc in rdx)
//   - exception_in_tls_offset: as above, but oop/pc already stored in the thread
//   - reexecute_offset:        re-execute the bytecode at the deopt bci
//   - JVMCI-only uncommon-trap entries (when EnableJVMCI)
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  int pad = 0;
  if (UseAVX > 2) {
    // Wide (ZMM) vector saves emit more code; enlarge the buffer.
    pad += 1024;
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  //    rax: exception oop
  //    rbx: exception handler
  //    rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt.  We save the argument return
  // registers.  We call the first C routine, fetch_unroll_info().  This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state.  We pop the current frame, then
  // push all the new frames.  Then we call the C routine unpack_frames() to
  // populate these frames.  Finally unpack_frames() returns us the new target
  // address.  Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for non exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization.  Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (EnableJVMCI && UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // return address is the pc describes what bci to do re-execute at

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    // Re-push the implicit-exception pc as this frame's return address,
    // then clear the thread-local field so it is consumed exactly once.
    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    // Pass the pending deoptimization reason and reset the field to -1.
    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // all registers are dead at this entry point, except for rax, and
  // rdx which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exceptiop oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception.  Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // load throwing pc from JavaThread and patch it as the return address
  // of the current frame. Then clear the field in JavaThread

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  __ bind(cont);

  // Call C code.  Need thread and this frame, but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers.  Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped of the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0));      // Load frame size
  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new ebp
  __ subptr(rsp, rbx);                  // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
  __ decrementl(rdx);                   // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));          // Save final return address

  // Re-push self-frame
  __ enter();                           // Save old & set new ebp

  // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two less words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave();                           // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  // Record the entry-point offsets computed above in the blob.
  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif
}

#ifdef COMPILER2
//------------------------------generate_uncommon_trap_blob--------------------
// Builds the blob C2 code jumps to for an uncommon trap: calls
// Deoptimization::uncommon_trap(), then pops the deoptimized frame and
// pushes skeletal interpreter frames, like generate_deopt_blob above.
void SharedRuntime::generate_uncommon_trap_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  address start = __ pc();

  if (UseRTMLocking) {
    // Abort RTM transaction before possible nmethod deoptimization.
    __ xabort(0);
  }

  // Push self-frame.  We get here with a return address on the
  // stack, so rsp is 8-byte aligned until we allocate our frame.
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!

  // No callee saved registers. rbp is assumed implicitly saved
  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // compiler left unloaded_class_index in j_rarg0 move to where the
  // runtime expects it.
  __ movl(c_rarg1, j_rarg0);

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // capture callee-saved registers as well as return values.
  // Thread is in rdi already.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);

  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));

  // Set an oopmap for the call site
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);

  // location of rbp is known implicitly by the frame sender code

  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
              Deoptimization::Unpack_uncommon_trap);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
    __ bind(L);
  }
#endif

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).

  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!

  // Pop deoptimized frame (int)
  __ movl(rcx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx (address*)
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the return pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset()));

  // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset())); // (int)

  // Now adjust the caller's stack to make up for the extra locals but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame and the stack walking of interpreter_sender
  // will get the unextended sp value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset())); // (int)
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, rbx);             // Prolog
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
            sender_sp);            // Make it walkable
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address

  // Re-push self-frame
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
                                   // Prolog

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // restore return values to their stack-slots with the new SP.
  // Thread is in rdi already.
  //
  // BasicType unpack_frames(JavaThread* thread, int exec_mode);

  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Pop self-frame.
  __ leave();                 // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code.  Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: the poll happened at a method return; otherwise the thread
  // was stopped mid-method by the signal handler and we must rebuild the pc.
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  if (UseRTMLocking) {
    // Abort RTM transaction before calling runtime
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24       test   %eax,(%rsp)
    // 41 85 04 24    test   %eax,(%r12)
    // 85 45 00       test   %eax,0x0(%rbp)
    // 41 85 45 00    test   %eax,0x0(%r13)

    // Step over an optional REX.B prefix byte (0x41).
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);    // these encodings carry an extra SIB/disp byte
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
//
// Generate a stub that calls into vm to find out the proper destination
// of a java call. All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1200, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  // Store it into the saved-register area so restore_live_registers
  // materializes it in rbx, and the resolved target in rax.
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a.  Return carry.
// (len 64-bit limbs; the loop relies on len >= 1 so the first iteration
// always executes before the jne test.)
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

#else //_WINDOWS

// Windows has no GCC-style inline asm; emulate the subtraction above with
// the add-with-carry intrinsic: a - b == a + ~b + 1, so start with c = 1.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  // Fold the incoming borrow into the returned carry word.
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
// Product-scanning Montgomery multiply: m = a * b * R^-1 mod n, where the
// arrays hold little-endian 64-bit limbs of length len.  Column i of the
// result is accumulated in the triple-precision accumulator (t0,t1,t2),
// interleaving the partial products a[j]*b[i-j] with the reduction terms
// m[j]*n[i-j].  'inv' is -n^-1 mod 2^64 (checked by the assert below).
static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // Low half of the columns: choose m[i] so that column i becomes zero.
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    // By construction m[i]*n[0] cancels t0 exactly.
    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator down one limb for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // High half of the columns: these produce the result limbs in place.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final conditional subtraction(s) to bring the result below n.
  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
// Compute m = a * a mod n using Montgomery reduction.  Because the
// a[j]*a[i-j] terms of each column come in equal pairs, only half of
// them are computed and each is doubled with MACC2; when the column
// index is even, the unpaired middle (square) term a[j]*a[j] is added
// once.  Requires inv == -n[0]^-1 mod 2^64 (checked by the assert).
static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // Low columns 0 .. len-1.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);  // doubled off-diagonal term
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);     // middle square term (i-j == j)
    }
    // The m*n reduction terms are not symmetric; finish them singly.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv; // makes t0 + m[i]*n[0] == 0 (mod 2^64)
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator down one 64-bit word for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // High columns len .. 2*len-1: same pairing trick; the result is
  // written into m[], whose low words are no longer needed.
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);     // middle square term
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // While a carry word remains, subtract n to bring m into range.
  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed: d[len-1-i] receives swap(s[i]).
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3541 #define MONTGOMERY_SQUARING_THRESHOLD 64 3542 3543 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3544 jint len, jlong inv, 3545 jint *m_ints) { 3546 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3547 int longwords = len/2; 3548 3549 // Make very sure we don't use so much space that the stack might 3550 // overflow. 512 jints corresponds to an 16384-bit integer and 3551 // will use here a total of 8k bytes of stack space. 3552 int divisor = sizeof(julong) * 4; 3553 guarantee(longwords <= 8192 / divisor, "must be"); 3554 int total_allocation = longwords * sizeof (julong) * 4; 3555 julong *scratch = (julong *)alloca(total_allocation); 3556 3557 // Local scratch arrays 3558 julong 3559 *a = scratch + 0 * longwords, 3560 *b = scratch + 1 * longwords, 3561 *n = scratch + 2 * longwords, 3562 *m = scratch + 3 * longwords; 3563 3564 reverse_words((julong *)a_ints, a, longwords); 3565 reverse_words((julong *)b_ints, b, longwords); 3566 reverse_words((julong *)n_ints, n, longwords); 3567 3568 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3569 3570 reverse_words(m, (julong *)m_ints, longwords); 3571 } 3572 3573 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3574 jint len, jlong inv, 3575 jint *m_ints) { 3576 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3577 int longwords = len/2; 3578 3579 // Make very sure we don't use so much space that the stack might 3580 // overflow. 512 jints corresponds to an 16384-bit integer and 3581 // will use here a total of 6k bytes of stack space. 
3582 int divisor = sizeof(julong) * 3; 3583 guarantee(longwords <= (8192 / divisor), "must be"); 3584 int total_allocation = longwords * sizeof (julong) * 3; 3585 julong *scratch = (julong *)alloca(total_allocation); 3586 3587 // Local scratch arrays 3588 julong 3589 *a = scratch + 0 * longwords, 3590 *n = scratch + 1 * longwords, 3591 *m = scratch + 2 * longwords; 3592 3593 reverse_words((julong *)a_ints, a, longwords); 3594 reverse_words((julong *)n_ints, n, longwords); 3595 3596 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3597 ::montgomery_square(a, n, m, (julong)inv, longwords); 3598 } else { 3599 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3600 } 3601 3602 reverse_words(m, (julong *)m_ints, longwords); 3603 } 3604 3605 #ifdef COMPILER2 3606 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 3607 // 3608 //------------------------------generate_exception_blob--------------------------- 3609 // creates exception blob at the end 3610 // Using exception blob, this code is jumped from a compiled method. 3611 // (see emit_exception_handler in x86_64.ad file) 3612 // 3613 // Given an exception pc at a call we call into the runtime for the 3614 // handler in this method. This handler might merely restore state 3615 // (i.e. callee save registers) unwind the frame and jump to the 3616 // exception handler for the nmethod if there is no Java level handler 3617 // for the nmethod. 3618 // 3619 // This code is entered with a jmp. 3620 // 3621 // Arguments: 3622 // rax: exception oop 3623 // rdx: exception pc 3624 // 3625 // Results: 3626 // rax: exception oop 3627 // rdx: exception pc in caller or ??? 3628 // destination: exception handler of caller 3629 // 3630 // Note: the exception pc MUST be at a call (precise debug information) 3631 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved. 
//

// Emit the COMPILER2 exception blob described in the comment above:
// store the incoming exception state in the thread, call into the
// runtime to locate a handler, and jump to it.
void OptoRuntime::generate_exception_blob() {
  // This stub clobbers rax/rdx/rcx; they must not be callee-saved.
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  // framesize is counted in 32-bit slots; 4 slots == 16 bytes.
  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers.  See x86_64.ad.

  // rbp is an implicitly saved callee saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame where the exception happened in.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work.  It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site.  This oopmap will only be used if we
  // are unwinding the stack.  Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  // The map is keyed by the pc offset of the runtime call just emitted.
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save restore it in prolog/epilog) Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  // Move it out of the way so the exception oop can occupy rax.
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob.  framesize is in 32-bit slots; the blob frame
  // size is expressed in 64-bit words, hence the shift by one.
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2