/*
 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {
 public:
  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
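  // Layout note (illustrative): the XSAVE_AREA_* constants below mirror the
  // processor's XSAVE memory layout on the CPUs this code supports: the legacy
  // FXSAVE image places XMM state at byte 160, followed by the AVX, opmask,
  // and upper-bank ZMM extended regions. Each saved 64-bit quantity gets a
  // pair of jint slots; for example, DEF_XMM_OFFS(0) expands to
  //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
  // i.e. one slot for the low half and the next enum value for the high half.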
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored;
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints), not bytes or words.
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
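  // Resulting frame shape (illustrative sketch; stack grows down):
  //   [ return address      ]  <- pushed by the caller
  //   [ saved rbp           ]  <- enter()
  //   [ push_CPU_state area ]     flags, GPRs (incl. the extra rbp copy),
  //                               alignment word, fxsave/xsave image
  //   [ arg reg save area   ]  <- rsp after the subptr below (if any)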

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
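  // Each set_callee_saved call below records in the OopMap which stack slot
  // holds a register's saved value; e.g. rax can later be found at byte
  // offset rax_off * BytesPerInt (which is what rax_offset_in_bytes() returns).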

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // entry, and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets we get it included in the xsave area.
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets we get it included in the xsave area.
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and registers.
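  // Worked example (illustrative): for a static method with signature
  // (IJDLjava/lang/Object;)V, i.e.
  // sig_bt = { T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID, T_OBJECT },
  // the loop below assigns
  //   T_INT    -> j_rarg0
  //   T_LONG   -> j_rarg1   (its T_VOID half is set_bad())
  //   T_DOUBLE -> j_farg0   (its T_VOID half is set_bad())
  //   T_OBJECT -> j_rarg2
  // and returns 0, since no stack slots were needed.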
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //   i   st_off
    //   0   32     T_LONG
    //   1   24     T_VOID
    //   2   16     T_OBJECT
    //   3    8     T_BOOL
    //   -    0     return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, else we
  // lose the alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
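  // e.g. (illustrative) comp_args_on_stack == 5: 5 slots * 4 bytes = 20,
  // aligned up to 24 bytes, i.e. comp_words_on_stack == 3.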
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack just as the youngest frame
  // always sees it at the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs returning Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
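  // Two conventions are handled below. On Windows the first four arguments
  // share positional slots between the integer and FP register files and the
  // callee always gets a 32-byte register home area; on System V there are
  // six integer and eight FP argument registers and no home area.
  // Worked example (illustrative) for (int, double, int):
  //   SysV:  c_rarg0, c_farg0, c_rarg1   -> returns 0 stack slots
  //   Win64: c_rarg0, c_farg1, c_rarg2   -> returns 8 slots (home area)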

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx;  // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That's okay because at the very worst we'll miss an async sample,
    // and we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    CodeBuffer* cbuf = masm->code_section()->outer();
    address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  CodeBuffer* cbuf = masm->code_section()->outer();
  address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
1495 __ call(resolve); 1496 1497 oop_maps->add_gc_map(__ pc() - start, map); 1498 __ post_call_nop(); 1499 1500 __ jmpb(L_exit); 1501 1502 // --- Thawing path 1503 1504 __ bind(L_thaw); 1505 1506 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1507 1508 ContinuationEntry::_return_pc_offset = __ pc() - start; 1509 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1510 __ post_call_nop(); 1511 1512 // --- Normal exit (resolve/thawing) 1513 1514 __ bind(L_exit); 1515 1516 continuation_enter_cleanup(masm); 1517 __ pop(rbp); 1518 __ ret(0); 1519 1520 // --- Exception handling path 1521 1522 exception_offset = __ pc() - start; 1523 1524 continuation_enter_cleanup(masm); 1525 __ pop(rbp); 1526 1527 __ movptr(c_rarg0, r15_thread); 1528 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1529 1530 // rax still holds the original exception oop, save it before the call 1531 __ push(rax); 1532 1533 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1534 __ movptr(rbx, rax); 1535 1536 // Continue at exception handler: 1537 // rax: exception oop 1538 // rbx: exception handler 1539 // rdx: exception pc 1540 __ pop(rax); 1541 __ verify_oop(rax); 1542 __ pop(rdx); 1543 __ jmp(rbx); 1544 } 1545 1546 static void gen_continuation_yield(MacroAssembler* masm, 1547 const VMRegPair* regs, 1548 OopMapSet* oop_maps, 1549 int& frame_complete, 1550 int& stack_slots, 1551 int& compiled_entry_offset) { 1552 enum layout { 1553 rbp_off, 1554 rbpH_off, 1555 return_off, 1556 return_off2, 1557 framesize // inclusive of return address 1558 }; 1559 stack_slots = framesize / VMRegImpl::slots_per_word; 1560 assert(stack_slots == 2, "recheck layout"); 1561 1562 address start = __ pc(); 1563 compiled_entry_offset = __ pc() - start; 1564 __ enter(); 1565 address the_pc = __ pc(); 1566 1567 frame_complete = the_pc - start; 1568 1569 // This nop must be exactly at the PC we push into the frame info. 1570 // We use this nop for fast CodeBlob lookup, associate the OopMap 1571 // with it right away. 
1572 __ post_call_nop(); 1573 OopMap* map = new OopMap(framesize, 1); 1574 oop_maps->add_gc_map(frame_complete, map); 1575 1576 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1577 __ movptr(c_rarg0, r15_thread); 1578 __ movptr(c_rarg1, rsp); 1579 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1580 __ reset_last_Java_frame(true); 1581 1582 Label L_pinned; 1583 1584 __ testptr(rax, rax); 1585 __ jcc(Assembler::notZero, L_pinned); 1586 1587 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1588 continuation_enter_cleanup(masm); 1589 __ pop(rbp); 1590 __ ret(0); 1591 1592 __ bind(L_pinned); 1593 1594 // Pinned, return to caller 1595 1596 // handle pending exception thrown by freeze 1597 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1598 Label ok; 1599 __ jcc(Assembler::equal, ok); 1600 __ leave(); 1601 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1602 __ bind(ok); 1603 1604 __ leave(); 1605 __ ret(0); 1606 } 1607 1608 static void gen_special_dispatch(MacroAssembler* masm, 1609 const methodHandle& method, 1610 const BasicType* sig_bt, 1611 const VMRegPair* regs) { 1612 verify_oop_args(masm, method, sig_bt, regs); 1613 vmIntrinsics::ID iid = method->intrinsic_id(); 1614 1615 // Now write the args into the outgoing interpreter space 1616 bool has_receiver = false; 1617 Register receiver_reg = noreg; 1618 int member_arg_pos = -1; 1619 Register member_reg = noreg; 1620 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1621 if (ref_kind != 0) { 1622 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1623 member_reg = rbx; // known to be free at this point 1624 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1625 } else if (iid == vmIntrinsics::_invokeBasic) { 1626 has_receiver = true; 1627 } else if (iid == vmIntrinsics::_linkToNative) { 1628 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1629 member_reg = rbx; // known to be free at this point 1630 } else { 1631 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1632 } 1633 1634 if (member_reg != noreg) { 1635 // Load the member_arg into register, if necessary. 1636 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1637 VMReg r = regs[member_arg_pos].first(); 1638 if (r->is_stack()) { 1639 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1640 } else { 1641 // no data motion is needed 1642 member_reg = r->as_Register(); 1643 } 1644 } 1645 1646 if (has_receiver) { 1647 // Make sure the receiver is loaded into a register. 1648 assert(method->size_of_parameters() > 0, "oob"); 1649 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1650 VMReg r = regs[0].first(); 1651 assert(r->is_valid(), "bad receiver arg"); 1652 if (r->is_stack()) { 1653 // Porting note: This assumes that compiled calling conventions always 1654 // pass the receiver oop in a register. If this is not true on some 1655 // platform, pick a temp and load the receiver from stack. 
1656 fatal("receiver always in a register"); 1657 receiver_reg = j_rarg0; // known to be free at this point 1658 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1659 } else { 1660 // no data motion is needed 1661 receiver_reg = r->as_Register(); 1662 } 1663 } 1664 1665 // Figure out which address we are really jumping to: 1666 MethodHandles::generate_method_handle_dispatch(masm, iid, 1667 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1668 } 1669 1670 // --------------------------------------------------------------------------- 1671 // Generate a native wrapper for a given method. The method takes arguments 1672 // in the Java compiled code convention, marshals them to the native 1673 // convention (handlizes oops, etc), transitions to native, makes the call, 1674 // returns to java state (possibly blocking), unhandlizes any result and 1675 // returns. 1676 // 1677 // Critical native functions are a shorthand for the use of 1678 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1679 // functions. The wrapper is expected to unpack the arguments before 1680 // passing them to the callee. Critical native functions leave the state _in_Java, 1681 // since they cannot stop for GC. 1682 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1683 // block and the check for pending exceptions it's impossible for them 1684 // to be thrown. 1685 // 1686 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1687 const methodHandle& method, 1688 int compile_id, 1689 BasicType* in_sig_bt, 1690 VMRegPair* in_regs, 1691 BasicType ret_type) { 1692 if (method->is_continuation_native_intrinsic()) { 1693 int exception_offset = -1; 1694 OopMapSet* oop_maps = new OopMapSet(); 1695 int frame_complete = -1; 1696 int stack_slots = -1; 1697 int interpreted_entry_offset = -1; 1698 int vep_offset = -1; 1699 if (method->is_continuation_enter_intrinsic()) { 1700 gen_continuation_enter(masm, 1701 in_regs, 1702 exception_offset, 1703 oop_maps, 1704 frame_complete, 1705 stack_slots, 1706 interpreted_entry_offset, 1707 vep_offset); 1708 } else if (method->is_continuation_yield_intrinsic()) { 1709 gen_continuation_yield(masm, 1710 in_regs, 1711 oop_maps, 1712 frame_complete, 1713 stack_slots, 1714 vep_offset); 1715 } else { 1716 guarantee(false, "Unknown Continuation native intrinsic"); 1717 } 1718 1719 #ifdef ASSERT 1720 if (method->is_continuation_enter_intrinsic()) { 1721 assert(interpreted_entry_offset != -1, "Must be set"); 1722 assert(exception_offset != -1, "Must be set"); 1723 } else { 1724 assert(interpreted_entry_offset == -1, "Must be unset"); 1725 assert(exception_offset == -1, "Must be unset"); 1726 } 1727 assert(frame_complete != -1, "Must be set"); 1728 assert(stack_slots != -1, "Must be set"); 1729 assert(vep_offset != -1, "Must be set"); 1730 #endif 1731 1732 __ flush(); 1733 nmethod* nm = nmethod::new_native_nmethod(method, 1734 compile_id, 1735 masm->code(), 1736 vep_offset, 1737 frame_complete, 1738 stack_slots, 1739 in_ByteSize(-1), 1740 in_ByteSize(-1), 1741 oop_maps, 1742 exception_offset); 1743 if (method->is_continuation_enter_intrinsic()) { 1744 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1745 } else if (method->is_continuation_yield_intrinsic()) { 1746 _cont_doYield_stub = nm; 1747 } 1748 return nm; 1749 } 1750 1751 if (method->is_method_handle_intrinsic()) { 1752 vmIntrinsics::ID iid = method->intrinsic_id(); 1753 intptr_t start = (intptr_t)__ pc(); 1754 int 
vep_offset = ((intptr_t)__ pc()) - start; 1755 gen_special_dispatch(masm, 1756 method, 1757 in_sig_bt, 1758 in_regs); 1759 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1760 __ flush(); 1761 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1762 return nmethod::new_native_nmethod(method, 1763 compile_id, 1764 masm->code(), 1765 vep_offset, 1766 frame_complete, 1767 stack_slots / VMRegImpl::slots_per_word, 1768 in_ByteSize(-1), 1769 in_ByteSize(-1), 1770 nullptr); 1771 } 1772 address native_func = method->native_function(); 1773 assert(native_func != nullptr, "must have function"); 1774 1775 // An OopMap for lock (and class if static) 1776 OopMapSet *oop_maps = new OopMapSet(); 1777 intptr_t start = (intptr_t)__ pc(); 1778 1779 // We have received a description of where all the java arg are located 1780 // on entry to the wrapper. We need to convert these args to where 1781 // the jni function will expect them. To figure out where they go 1782 // we convert the java signature to a C signature by inserting 1783 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1784 1785 const int total_in_args = method->size_of_parameters(); 1786 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1787 1788 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1789 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1790 BasicType* in_elem_bt = nullptr; 1791 1792 int argc = 0; 1793 out_sig_bt[argc++] = T_ADDRESS; 1794 if (method->is_static()) { 1795 out_sig_bt[argc++] = T_OBJECT; 1796 } 1797 1798 for (int i = 0; i < total_in_args ; i++ ) { 1799 out_sig_bt[argc++] = in_sig_bt[i]; 1800 } 1801 1802 // Now figure out where the args must be stored and how much stack space 1803 // they require. 1804 int out_arg_slots; 1805 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1806 1807 // Compute framesize for the wrapper. We need to handlize all oops in 1808 // incoming registers 1809 1810 // Calculate the total number of stack slots we will need. 
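// Worked example of the computation below (illustrative numbers only;
// assumes linux-x64, where out_preserve_stack_slots() is 0, all C args fit
// in registers so out_arg_slots is 0, and a static synchronized method):
//   0 + 0                      // ABI preserve area + outgoing stack args
//   + 6 * 2 = 12               // oop handle area (6 register args)
//   + 2                        // klass handle slot (static)
//   + 2                        // lock slot (synchronized)
//   + 2 + 4                    // shuffle/result save + return address/rbp
//   = 22, then align_up(22, StackAlignmentInSlots = 4) = 24 slots (96 bytes).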
1811 1812 // First count the abi requirement plus all of the outgoing args 1813 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1814 1815 // Now the space for the inbound oop handle area 1816 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1817 1818 int oop_handle_offset = stack_slots; 1819 stack_slots += total_save_slots; 1820 1821 // Now any space we need for handlizing a klass if static method 1822 1823 int klass_slot_offset = 0; 1824 int klass_offset = -1; 1825 int lock_slot_offset = 0; 1826 bool is_static = false; 1827 1828 if (method->is_static()) { 1829 klass_slot_offset = stack_slots; 1830 stack_slots += VMRegImpl::slots_per_word; 1831 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1832 is_static = true; 1833 } 1834 1835 // Plus a lock if needed 1836 1837 if (method->is_synchronized()) { 1838 lock_slot_offset = stack_slots; 1839 stack_slots += VMRegImpl::slots_per_word; 1840 } 1841 1842 // Now a place (+2) to save return values or temp during shuffling 1843 // + 4 for return address (which we own) and saved rbp 1844 stack_slots += 6; 1845 1846 // Ok The space we have allocated will look like: 1847 // 1848 // 1849 // FP-> | | 1850 // |---------------------| 1851 // | 2 slots for moves | 1852 // |---------------------| 1853 // | lock box (if sync) | 1854 // |---------------------| <- lock_slot_offset 1855 // | klass (if static) | 1856 // |---------------------| <- klass_slot_offset 1857 // | oopHandle area | 1858 // |---------------------| <- oop_handle_offset (6 java arg registers) 1859 // | outbound memory | 1860 // | based arguments | 1861 // | | 1862 // |---------------------| 1863 // | | 1864 // SP-> | out_preserved_slots | 1865 // 1866 // 1867 1868 1869 // Now compute actual number of stack words we need rounding to make 1870 // stack properly aligned. 1871 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1872 1873 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1874 1875 // First thing make an ic check to see if we should even be here 1876 1877 // We are free to use all registers as temps without saving them and 1878 // restoring them except rbp. rbp is the only callee save register 1879 // as far as the interpreter and the compiler(s) are concerned. 1880 1881 1882 const Register ic_reg = rax; 1883 const Register receiver = j_rarg0; 1884 1885 Label hit; 1886 Label exception_pending; 1887 1888 assert_different_registers(ic_reg, receiver, rscratch1, rscratch2); 1889 __ verify_oop(receiver); 1890 __ load_klass(rscratch1, receiver, rscratch2); 1891 __ cmpq(ic_reg, rscratch1); 1892 __ jcc(Assembler::equal, hit); 1893 1894 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1895 1896 // Verified entry point must be aligned 1897 __ align(8); 1898 1899 __ bind(hit); 1900 1901 int vep_offset = ((intptr_t)__ pc()) - start; 1902 1903 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1904 Label L_skip_barrier; 1905 Register klass = r10; 1906 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1907 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1908 1909 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1910 1911 __ bind(L_skip_barrier); 1912 } 1913 1914 #ifdef COMPILER1 1915 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
1916 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1917 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1918 }
1919 #endif // COMPILER1
1920
1921 // The instruction at the verified entry point must be 5 bytes or longer
1922 // because it can be patched on the fly by make_non_entrant. The stack bang
1923 // instruction fits that requirement.
1924
1925 // Generate stack overflow check
1926 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1927
1928 // Generate a new frame for the wrapper.
1929 __ enter();
1930 // -2 because return address is already present and so is saved rbp
1931 __ subptr(rsp, stack_size - 2*wordSize);
1932
1933 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1934 // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
1935 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1936
1937 // Frame is now completed as far as size and linkage.
1938 int frame_complete = ((intptr_t)__ pc()) - start;
1939
1940 if (UseRTMLocking) {
1941 // Abort RTM transaction before calling JNI
1942 // because critical section will be large and will be
1943 // aborted anyway. Also nmethod could be deoptimized.
1944 __ xabort(0);
1945 }
1946
1947 #ifdef ASSERT
1948 __ check_stack_alignment(rsp, "improperly aligned stack");
1949 #endif /* ASSERT */
1950
1951
1952 // We use r14 as the oop handle for the receiver/klass
1953 // It is callee-saved, so it survives the call to native
1954
1955 const Register oop_handle_reg = r14;
1956
1957 //
1958 // We immediately shuffle the arguments so that for any vm call we have to
1959 // make from here on out (sync slow path, jvmti, etc.) we will have
1960 // captured the oops from our caller and have a valid oopMap for
1961 // them.
1962
1963 // -----------------
1964 // The Grand Shuffle
1965
1966 // The Java calling convention is either equal to (linux) or denser than (win64) the
1967 // c calling convention. However, because of the jni_env argument, the c calling
1968 // convention always has at least one more (and two for static) arguments than Java.
1969 // Therefore, if we move the args from java -> c backwards, we will never have
1970 // a register->register conflict and we don't have to build a dependency graph
1971 // and figure out how to break any cycles.
1972 //
1973
1974 // Record esp-based slot for receiver on stack for non-static methods
1975 int receiver_offset = -1;
1976
1977 // This is a trick. We double the stack slots so we can claim
1978 // the oops in the caller's frame. Since we are sure to have
1979 // more args than the caller, doubling is enough to make
1980 // sure we can capture all the incoming oop args from the
1981 // caller.
1982 //
1983 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1984
1985 // Mark location of rbp (someday)
1986 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1987
1988 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1989 // All inbound args are referenced based on rbp and all outbound args via rsp.
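// (Illustration of why the backward walk is safe: java arg i lands at C
//  position i+1, or i+2 for a static method, so each destination sits one
//  slot "ahead" of its source. Walking i from last to first, a destination
//  register can only coincide with a source that has already been consumed,
//  never one still pending, so no register cycle breaking is required;
//  memory-to-memory moves still go through rax/rbx as noted above.)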
1990 1991 1992 #ifdef ASSERT 1993 bool reg_destroyed[Register::number_of_registers]; 1994 bool freg_destroyed[XMMRegister::number_of_registers]; 1995 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 1996 reg_destroyed[r] = false; 1997 } 1998 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 1999 freg_destroyed[f] = false; 2000 } 2001 2002 #endif /* ASSERT */ 2003 2004 // For JNI natives the incoming and outgoing registers are offset upwards. 2005 GrowableArray<int> arg_order(2 * total_in_args); 2006 2007 VMRegPair tmp_vmreg; 2008 tmp_vmreg.set2(rbx->as_VMReg()); 2009 2010 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2011 arg_order.push(i); 2012 arg_order.push(c_arg); 2013 } 2014 2015 int temploc = -1; 2016 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2017 int i = arg_order.at(ai); 2018 int c_arg = arg_order.at(ai + 1); 2019 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2020 #ifdef ASSERT 2021 if (in_regs[i].first()->is_Register()) { 2022 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2023 } else if (in_regs[i].first()->is_XMMRegister()) { 2024 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2025 } 2026 if (out_regs[c_arg].first()->is_Register()) { 2027 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2028 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2029 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2030 } 2031 #endif /* ASSERT */ 2032 switch (in_sig_bt[i]) { 2033 case T_ARRAY: 2034 case T_OBJECT: 2035 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2036 ((i == 0) && (!is_static)), 2037 &receiver_offset); 2038 break; 2039 case T_VOID: 2040 break; 2041 2042 case T_FLOAT: 2043 __ float_move(in_regs[i], out_regs[c_arg]); 2044 break; 2045 2046 case T_DOUBLE: 2047 assert( i + 1 < total_in_args && 2048 in_sig_bt[i + 1] == T_VOID && 2049 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2050 __ double_move(in_regs[i], out_regs[c_arg]); 2051 break; 2052 2053 case T_LONG : 2054 __ long_move(in_regs[i], out_regs[c_arg]); 2055 break; 2056 2057 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2058 2059 default: 2060 __ move32_64(in_regs[i], out_regs[c_arg]); 2061 } 2062 } 2063 2064 int c_arg; 2065 2066 // Pre-load a static method's oop into r14. Used both by locking code and 2067 // the normal JNI call code. 2068 // point c_arg at the first arg that is already loaded in case we 2069 // need to spill before we call out 2070 c_arg = total_c_args - total_in_args; 2071 2072 if (method->is_static()) { 2073 2074 // load oop into a register 2075 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2076 2077 // Now handlize the static class mirror it's known not-null. 2078 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2079 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2080 2081 // Now get the handle 2082 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2083 // store the klass handle as second argument 2084 __ movptr(c_rarg1, oop_handle_reg); 2085 // and protect the arg if we must spill 2086 c_arg--; 2087 } 2088 2089 // Change state to native (we save the return address in the thread, since it might not 2090 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2091 // points into the right code segment. 
It does not have to be the correct return pc.
2092 // We use the same pc/oopMap repeatedly when we call out
2093
2094 intptr_t the_pc = (intptr_t) __ pc();
2095 oop_maps->add_gc_map(the_pc - start, map);
2096
2097 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2098
2099
2100 // We have all of the arguments set up at this point. We must not touch any of the
2101 // argument registers now: they hold the outgoing C arguments, and we have no oopMap coverage for saving/restoring them.
2102
2103 {
2104 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2105 // protect the args we've loaded
2106 save_args(masm, total_c_args, c_arg, out_regs);
2107 __ mov_metadata(c_rarg1, method());
2108 __ call_VM_leaf(
2109 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2110 r15_thread, c_rarg1);
2111 restore_args(masm, total_c_args, c_arg, out_regs);
2112 }
2113
2114 // RedefineClasses() tracing support for obsolete method entry
2115 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2116 // protect the args we've loaded
2117 save_args(masm, total_c_args, c_arg, out_regs);
2118 __ mov_metadata(c_rarg1, method());
2119 __ call_VM_leaf(
2120 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2121 r15_thread, c_rarg1);
2122 restore_args(masm, total_c_args, c_arg, out_regs);
2123 }
2124
2125 // Lock a synchronized method
2126
2127 // Register definitions used by locking and unlocking
2128
2129 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2130 const Register obj_reg = rbx; // Will contain the oop
2131 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2132 const Register old_hdr = r13; // value of old header at unlock time
2133
2134 Label slow_path_lock;
2135 Label lock_done;
2136
2137 if (method->is_synchronized()) {
2138 Label count_mon;
2139
2140 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2141
2142 // Get the handle (the 2nd argument)
2143 __ mov(oop_handle_reg, c_rarg1);
2144
2145 // Get address of the box
2146
2147 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2148
2149 // Load the oop from the handle
2150 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2151
2152 if (LockingMode == LM_MONITOR) {
2153 __ jmp(slow_path_lock);
2154 } else if (LockingMode == LM_LEGACY) {
2155 // Load immediate 1 into swap_reg %rax
2156 __ movl(swap_reg, 1);
2157
2158 // Load (object->mark() | 1) into swap_reg %rax
2159 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2160
2161 // Save (object->mark() | 1) into BasicLock's displaced header
2162 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2163
2164 // src -> dest iff dest == rax else rax <- dest
2165 __ lock();
2166 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2167 __ jcc(Assembler::equal, count_mon);
2168
2169 // Hmm should this move to the slow path code area???
2170
2171 // Test if the oopMark is an obvious stack pointer, i.e.,
2172 // 1) (mark & 3) == 0, and
2173 // 2) rsp <= mark < rsp + os::vm_page_size()
2174 // These 3 tests can be done by evaluating the following
2175 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2176 // assuming both stack pointer and pagesize have their
2177 // least significant 2 bits clear.
2178 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2179
2180 __ subptr(swap_reg, rsp);
2181 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2182
2183 // Save the test result; for the recursive case, the result is zero
2184 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2185 __ jcc(Assembler::notEqual, slow_path_lock);
2186 } else {
2187 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2188 // Load object header
2189 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2190 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2191 }
2192 __ bind(count_mon);
2193 __ inc_held_monitor_count();
2194
2195 // Slow path will re-enter here
2196 __ bind(lock_done);
2197 }
2198
2199 // Finally just about ready to make the JNI call
2200
2201 // get JNIEnv* which is first argument to native
2202 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2203
2204 // Now set thread in native
2205 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2206
2207 __ call(RuntimeAddress(native_func));
2208
2209 // Verify or restore cpu control state after JNI call
2210 __ restore_cpu_control_state_after_jni(rscratch1);
2211
2212 // Unpack native results.
2213 switch (ret_type) {
2214 case T_BOOLEAN: __ c2bool(rax); break;
2215 case T_CHAR : __ movzwl(rax, rax); break;
2216 case T_BYTE : __ sign_extend_byte (rax); break;
2217 case T_SHORT : __ sign_extend_short(rax); break;
2218 case T_INT : /* nothing to do */ break;
2219 case T_DOUBLE :
2220 case T_FLOAT :
2221 // Result is in xmm0; we'll save it as needed
2222 break;
2223 case T_ARRAY: // Really a handle
2224 case T_OBJECT: // Really a handle
2225 break; // can't de-handlize until after safepoint check
2226 case T_VOID: break;
2227 case T_LONG: break;
2228 default : ShouldNotReachHere();
2229 }
2230
2231 Label after_transition;
2232
2233 // Switch thread to "native transition" state before reading the synchronization state.
2234 // This additional state is necessary because reading and testing the synchronization
2235 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2236 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2237 // VM thread changes sync state to synchronizing and suspends threads for GC.
2238 // Thread A is resumed to finish this native method, but doesn't block here since it
2239 // didn't see any synchronization in progress, and escapes.
2240 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2241
2242 // Force this write out before the read below
2243 if (!UseSystemMemoryBarrier) {
2244 __ membar(Assembler::Membar_mask_bits(
2245 Assembler::LoadLoad | Assembler::LoadStore |
2246 Assembler::StoreLoad | Assembler::StoreStore));
2247 }
2248
2249 // check for safepoint operation in progress and/or pending suspend requests
2250 {
2251 Label Continue;
2252 Label slow_path;
2253
2254 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2255
2256 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2257 __ jcc(Assembler::equal, Continue);
2258 __ bind(slow_path);
2259
2260 // Don't use call_VM as it will see a possible pending exception and forward it,
2261 // never returning here and preventing us from clearing _last_native_pc down below.
2262 // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2263 // preserved and correspond to the bcp/locals pointers.
So we do a runtime call 2264 // by hand. 2265 // 2266 __ vzeroupper(); 2267 save_native_result(masm, ret_type, stack_slots); 2268 __ mov(c_rarg0, r15_thread); 2269 __ mov(r12, rsp); // remember sp 2270 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2271 __ andptr(rsp, -16); // align stack as required by ABI 2272 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2273 __ mov(rsp, r12); // restore sp 2274 __ reinit_heapbase(); 2275 // Restore any method result value 2276 restore_native_result(masm, ret_type, stack_slots); 2277 __ bind(Continue); 2278 } 2279 2280 // change thread state 2281 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2282 __ bind(after_transition); 2283 2284 Label reguard; 2285 Label reguard_done; 2286 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2287 __ jcc(Assembler::equal, reguard); 2288 __ bind(reguard_done); 2289 2290 // native result if any is live 2291 2292 // Unlock 2293 Label slow_path_unlock; 2294 Label unlock_done; 2295 if (method->is_synchronized()) { 2296 2297 Label fast_done; 2298 2299 // Get locked oop from the handle we passed to jni 2300 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2301 2302 if (LockingMode == LM_LEGACY) { 2303 Label not_recur; 2304 // Simple recursive lock? 2305 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2306 __ jcc(Assembler::notEqual, not_recur); 2307 __ dec_held_monitor_count(); 2308 __ jmpb(fast_done); 2309 __ bind(not_recur); 2310 } 2311 2312 // Must save rax if it is live now because cmpxchg must use it 2313 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2314 save_native_result(masm, ret_type, stack_slots); 2315 } 2316 2317 if (LockingMode == LM_MONITOR) { 2318 __ jmp(slow_path_unlock); 2319 } else if (LockingMode == LM_LEGACY) { 2320 // get address of the stack lock 2321 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2322 // get old displaced header 2323 __ movptr(old_hdr, Address(rax, 0)); 2324 2325 // Atomic swap old header if oop still contains the stack lock 2326 __ lock(); 2327 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2328 __ jcc(Assembler::notEqual, slow_path_unlock); 2329 __ dec_held_monitor_count(); 2330 } else { 2331 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2332 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2333 __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place); 2334 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2335 __ dec_held_monitor_count(); 2336 } 2337 2338 // slow path re-enters here 2339 __ bind(unlock_done); 2340 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2341 restore_native_result(masm, ret_type, stack_slots); 2342 } 2343 2344 __ bind(fast_done); 2345 } 2346 { 2347 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2348 save_native_result(masm, ret_type, stack_slots); 2349 __ mov_metadata(c_rarg1, method()); 2350 __ call_VM_leaf( 2351 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2352 r15_thread, c_rarg1); 2353 restore_native_result(masm, ret_type, stack_slots); 2354 } 2355 2356 __ reset_last_Java_frame(false); 2357 2358 // Unbox oop result, e.g. JNIHandles::resolve value. 
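// (A JNI native returns a jobject, i.e. a handle; the raw oop must be
//  reloaded before returning to Java code. Roughly, assuming the simple
//  strong local-handle case, where the real code also dispatches through
//  the BarrierSet for weak and global handles:
//
//    oop result = (handle == nullptr) ? oop(nullptr) : *(oop*)handle;
//  )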
2359 if (is_reference_type(ret_type)) { 2360 __ resolve_jobject(rax /* value */, 2361 r15_thread /* thread */, 2362 rcx /* tmp */); 2363 } 2364 2365 if (CheckJNICalls) { 2366 // clear_pending_jni_exception_check 2367 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2368 } 2369 2370 // reset handle block 2371 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2372 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2373 2374 // pop our frame 2375 2376 __ leave(); 2377 2378 // Any exception pending? 2379 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2380 __ jcc(Assembler::notEqual, exception_pending); 2381 2382 // Return 2383 2384 __ ret(0); 2385 2386 // Unexpected paths are out of line and go here 2387 2388 // forward the exception 2389 __ bind(exception_pending); 2390 2391 // and forward the exception 2392 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2393 2394 // Slow path locking & unlocking 2395 if (method->is_synchronized()) { 2396 2397 // BEGIN Slow path lock 2398 __ bind(slow_path_lock); 2399 2400 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2401 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2402 2403 // protect the args we've loaded 2404 save_args(masm, total_c_args, c_arg, out_regs); 2405 2406 __ mov(c_rarg0, obj_reg); 2407 __ mov(c_rarg1, lock_reg); 2408 __ mov(c_rarg2, r15_thread); 2409 2410 // Not a leaf but we have last_Java_frame setup as we want 2411 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2412 restore_args(masm, total_c_args, c_arg, out_regs); 2413 2414 #ifdef ASSERT 2415 { Label L; 2416 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2417 __ jcc(Assembler::equal, L); 2418 __ stop("no pending exception allowed on exit from monitorenter"); 2419 __ bind(L); 2420 } 2421 #endif 2422 __ jmp(lock_done); 2423 2424 // END Slow path lock 2425 2426 // BEGIN Slow path unlock 2427 __ bind(slow_path_unlock); 2428 2429 // If we haven't already saved the native result we must save it now as xmm registers 2430 // are still exposed. 
2431 __ vzeroupper(); 2432 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2433 save_native_result(masm, ret_type, stack_slots); 2434 } 2435 2436 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2437 2438 __ mov(c_rarg0, obj_reg); 2439 __ mov(c_rarg2, r15_thread); 2440 __ mov(r12, rsp); // remember sp 2441 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2442 __ andptr(rsp, -16); // align stack as required by ABI 2443 2444 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2445 // NOTE that obj_reg == rbx currently 2446 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2447 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2448 2449 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2450 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2451 __ mov(rsp, r12); // restore sp 2452 __ reinit_heapbase(); 2453 #ifdef ASSERT 2454 { 2455 Label L; 2456 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2457 __ jcc(Assembler::equal, L); 2458 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2459 __ bind(L); 2460 } 2461 #endif /* ASSERT */ 2462 2463 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2464 2465 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2466 restore_native_result(masm, ret_type, stack_slots); 2467 } 2468 __ jmp(unlock_done); 2469 2470 // END Slow path unlock 2471 2472 } // synchronized 2473 2474 // SLOW PATH Reguard the stack if needed 2475 2476 __ bind(reguard); 2477 __ vzeroupper(); 2478 save_native_result(masm, ret_type, stack_slots); 2479 __ mov(r12, rsp); // remember sp 2480 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2481 __ andptr(rsp, -16); // align stack as required by ABI 2482 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2483 __ mov(rsp, r12); // restore sp 2484 __ reinit_heapbase(); 2485 restore_native_result(masm, ret_type, stack_slots); 2486 // and continue 2487 __ jmp(reguard_done); 2488 2489 2490 2491 __ flush(); 2492 2493 nmethod *nm = nmethod::new_native_nmethod(method, 2494 compile_id, 2495 masm->code(), 2496 vep_offset, 2497 frame_complete, 2498 stack_slots / VMRegImpl::slots_per_word, 2499 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2500 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2501 oop_maps); 2502 2503 return nm; 2504 } 2505 2506 // this function returns the adjust size (in number of words) to a c2i adapter 2507 // activation for use during deoptimization 2508 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2509 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2510 } 2511 2512 2513 uint SharedRuntime::out_preserve_stack_slots() { 2514 return 0; 2515 } 2516 2517 2518 // Number of stack slots between incoming argument block and the start of 2519 // a new frame. The PROLOG must add this many slots to the stack. The 2520 // EPILOG must remove this many slots. amd64 needs two slots for 2521 // return address. 
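// (Stack slots are 32-bit jints: the 8-byte return address accounts for 2
//  slots and the saved rbp for 2 more, which is where the 4 below comes
//  from. The extra 2 * VerifyStackAtCalls presumably covers the additional
//  verification word pushed when that flag is enabled; an assumption, not
//  confirmed by this file.)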
2522 uint SharedRuntime::in_preserve_stack_slots() { 2523 return 4 + 2 * VerifyStackAtCalls; 2524 } 2525 2526 //------------------------------generate_deopt_blob---------------------------- 2527 void SharedRuntime::generate_deopt_blob() { 2528 // Allocate space for the code 2529 ResourceMark rm; 2530 // Setup code generation tools 2531 int pad = 0; 2532 if (UseAVX > 2) { 2533 pad += 1024; 2534 } 2535 #if INCLUDE_JVMCI 2536 if (EnableJVMCI) { 2537 pad += 512; // Increase the buffer size when compiling for JVMCI 2538 } 2539 #endif 2540 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2541 MacroAssembler* masm = new MacroAssembler(&buffer); 2542 int frame_size_in_words; 2543 OopMap* map = nullptr; 2544 OopMapSet *oop_maps = new OopMapSet(); 2545 2546 // ------------- 2547 // This code enters when returning to a de-optimized nmethod. A return 2548 // address has been pushed on the stack, and return values are in 2549 // registers. 2550 // If we are doing a normal deopt then we were called from the patched 2551 // nmethod from the point we returned to the nmethod. So the return 2552 // address on the stack is wrong by NativeCall::instruction_size 2553 // We will adjust the value so it looks like we have the original return 2554 // address on the stack (like when we eagerly deoptimized). 2555 // In the case of an exception pending when deoptimizing, we enter 2556 // with a return address on the stack that points after the call we patched 2557 // into the exception handler. We have the following register state from, 2558 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2559 // rax: exception oop 2560 // rbx: exception handler 2561 // rdx: throwing pc 2562 // So in this case we simply jam rdx into the useless return address and 2563 // the stack looks just like we want. 2564 // 2565 // At this point we need to de-opt. We save the argument return 2566 // registers. We call the first C routine, fetch_unroll_info(). This 2567 // routine captures the return values and returns a structure which 2568 // describes the current frame size and the sizes of all replacement frames. 2569 // The current frame is compiled code and may contain many inlined 2570 // functions, each with their own JVM state. We pop the current frame, then 2571 // push all the new frames. Then we call the C routine unpack_frames() to 2572 // populate these frames. Finally unpack_frames() returns us the new target 2573 // address. Notice that callee-save registers are BLOWN here; they have 2574 // already been captured in the vframeArray at the time the return PC was 2575 // patched. 2576 address start = __ pc(); 2577 Label cont; 2578 2579 // Prolog for non exception case! 2580 2581 // Save everything in sight. 2582 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2583 2584 // Normal deoptimization. Save exec mode for unpack_frames. 
__ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2586 __ jmp(cont);
2587
2588 int reexecute_offset = __ pc() - start;
2589 #if INCLUDE_JVMCI && !defined(COMPILER1)
2590 if (EnableJVMCI && UseJVMCICompiler) {
2591 // JVMCI does not use this kind of deoptimization
2592 __ should_not_reach_here();
2593 }
2594 #endif
2595
2596 // Reexecute case
2597 // the return address is the pc that describes which bci to re-execute at
2598
2599 // No need to update map as each call to save_live_registers will produce identical oopmap
2600 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2601
2602 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2603 __ jmp(cont);
2604
2605 #if INCLUDE_JVMCI
2606 Label after_fetch_unroll_info_call;
2607 int implicit_exception_uncommon_trap_offset = 0;
2608 int uncommon_trap_offset = 0;
2609
2610 if (EnableJVMCI) {
2611 implicit_exception_uncommon_trap_offset = __ pc() - start;
2612
2613 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2614 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2615
2616 uncommon_trap_offset = __ pc() - start;
2617
2618 // Save everything in sight.
2619 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2620 // fetch_unroll_info needs to call last_java_frame()
2621 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2622
2623 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2624 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2625
2626 __ movl(r14, Deoptimization::Unpack_reexecute);
2627 __ mov(c_rarg0, r15_thread);
2628 __ movl(c_rarg2, r14); // exec mode
2629 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2630 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2631
2632 __ reset_last_Java_frame(false);
2633
2634 __ jmp(after_fetch_unroll_info_call);
2635 } // EnableJVMCI
2636 #endif // INCLUDE_JVMCI
2637
2638 int exception_offset = __ pc() - start;
2639
2640 // Prolog for exception case
2641
2642 // All registers are dead at this entry point, except for rax and
2643 // rdx, which contain the exception oop and exception pc
2644 // respectively. Set them in TLS and fall thru to the
2645 // unpack_with_exception_in_tls entry point.
2646
2647 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2648 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2649
2650 int exception_in_tls_offset = __ pc() - start;
2651
2652 // new implementation because exception oop is now passed in JavaThread
2653
2654 // Prolog for exception case
2655 // All registers must be preserved because they might be used by LinearScan
2656 // Exception oop and throwing PC are passed in JavaThread
2657 // tos: stack at point of call to method that threw the exception (i.e. only
2658 // args are on the stack, no return address)
2659
2660 // make room on stack for the return address
2661 // It will be patched later with the throwing pc. The correct value is not
2662 // available now because loading it from memory would destroy registers.
2663 __ push(0);
2664
2665 // Save everything in sight.
2666 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2667
2668 // Now it is safe to overwrite any register
2669
2670 // Deopt during an exception.
Save exec mode for unpack_frames.
2671 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2672
2673 // load throwing pc from JavaThread and patch it as the return address
2674 // of the current frame. Then clear the field in JavaThread
2675
2676 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2677 __ movptr(Address(rbp, wordSize), rdx);
2678 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2679
2680 #ifdef ASSERT
2681 // verify that there is really an exception oop in JavaThread
2682 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2683 __ verify_oop(rax);
2684
2685 // verify that there is no pending exception
2686 Label no_pending_exception;
2687 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2688 __ testptr(rax, rax);
2689 __ jcc(Assembler::zero, no_pending_exception);
2690 __ stop("must not have pending exception here");
2691 __ bind(no_pending_exception);
2692 #endif
2693
2694 __ bind(cont);
2695
2696 // Call C code. Need thread and this frame, but NOT official VM entry
2697 // crud. We cannot block on this call, no GC can happen.
2698 //
2699 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2700
2701 // fetch_unroll_info needs to call last_java_frame().
2702
2703 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2704 #ifdef ASSERT
2705 { Label L;
2706 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2707 __ jcc(Assembler::equal, L);
2708 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2709 __ bind(L);
2710 }
2711 #endif // ASSERT
2712 __ mov(c_rarg0, r15_thread);
2713 __ movl(c_rarg1, r14); // exec_mode
2714 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2715
2716 // Need to have an oopmap that tells fetch_unroll_info where to
2717 // find any register it might need.
2718 oop_maps->add_gc_map(__ pc() - start, map);
2719
2720 __ reset_last_Java_frame(false);
2721
2722 #if INCLUDE_JVMCI
2723 if (EnableJVMCI) {
2724 __ bind(after_fetch_unroll_info_call);
2725 }
2726 #endif
2727
2728 // Load UnrollBlock* into rdi
2729 __ mov(rdi, rax);
2730
2731 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2732 Label noException;
2733 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2734 __ jcc(Assembler::notEqual, noException);
2735 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2736 // QQQ this is useless; it was null above
2737 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2738 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2739 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2740
2741 __ verify_oop(rax);
2742
2743 // Overwrite the result registers with the exception results.
2744 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2745 // I think this is useless
2746 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2747
2748 __ bind(noException);
2749
2750 // Only register save data is on the stack.
2751 // Now restore the result registers. Everything else is either dead
2752 // or captured in the vframeArray.
2753 RegisterSaver::restore_result_registers(masm);
2754
2755 // All of the register save area has been popped off the stack. Only the
2756 // return address remains.
2757
2758 // Pop all the frames we must move/replace.
2759 // 2760 // Frame picture (youngest to oldest) 2761 // 1: self-frame (no frame link) 2762 // 2: deopting frame (no frame link) 2763 // 3: caller of deopting frame (could be compiled/interpreted). 2764 // 2765 // Note: by leaving the return address of self-frame on the stack 2766 // and using the size of frame 2 to adjust the stack 2767 // when we are done the return to frame 3 will still be on the stack. 2768 2769 // Pop deoptimized frame 2770 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2771 __ addptr(rsp, rcx); 2772 2773 // rsp should be pointing at the return address to the caller (3) 2774 2775 // Pick up the initial fp we should save 2776 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2777 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2778 2779 #ifdef ASSERT 2780 // Compilers generate code that bang the stack by as much as the 2781 // interpreter would need. So this stack banging should never 2782 // trigger a fault. Verify that it does not on non product builds. 2783 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2784 __ bang_stack_size(rbx, rcx); 2785 #endif 2786 2787 // Load address of array of frame pcs into rcx 2788 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2789 2790 // Trash the old pc 2791 __ addptr(rsp, wordSize); 2792 2793 // Load address of array of frame sizes into rsi 2794 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2795 2796 // Load counter into rdx 2797 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2798 2799 // Now adjust the caller's stack to make up for the extra locals 2800 // but record the original sp so that we can save it in the skeletal interpreter 2801 // frame and the stack walking of interpreter_sender will get the unextended sp 2802 // value and not the "real" sp value. 2803 2804 const Register sender_sp = r8; 2805 2806 __ mov(sender_sp, rsp); 2807 __ movl(rbx, Address(rdi, 2808 Deoptimization::UnrollBlock:: 2809 caller_adjustment_offset())); 2810 __ subptr(rsp, rbx); 2811 2812 // Push interpreter frames in a loop 2813 Label loop; 2814 __ bind(loop); 2815 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2816 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2817 __ pushptr(Address(rcx, 0)); // Save return address 2818 __ enter(); // Save old & set new ebp 2819 __ subptr(rsp, rbx); // Prolog 2820 // This value is corrected by layout_activation_impl 2821 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2822 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2823 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2824 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2825 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2826 __ decrementl(rdx); // Decrement counter 2827 __ jcc(Assembler::notZero, loop); 2828 __ pushptr(Address(rcx, 0)); // Save final return address 2829 2830 // Re-push self-frame 2831 __ enter(); // Save old & set new ebp 2832 2833 // Allocate a full sized register save area. 2834 // Return address and rbp are in place, so we allocate two less words. 
2835 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2836 2837 // Restore frame locals after moving the frame 2838 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2839 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2840 2841 // Call C code. Need thread but NOT official VM entry 2842 // crud. We cannot block on this call, no GC can happen. Call should 2843 // restore return values to their stack-slots with the new SP. 2844 // 2845 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2846 2847 // Use rbp because the frames look interpreted now 2848 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2849 // Don't need the precise return PC here, just precise enough to point into this code blob. 2850 address the_pc = __ pc(); 2851 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2852 2853 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2854 __ mov(c_rarg0, r15_thread); 2855 __ movl(c_rarg1, r14); // second arg: exec_mode 2856 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2857 // Revert SP alignment after call since we're going to do some SP relative addressing below 2858 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2859 2860 // Set an oopmap for the call site 2861 // Use the same PC we used for the last java frame 2862 oop_maps->add_gc_map(the_pc - start, 2863 new OopMap( frame_size_in_words, 0 )); 2864 2865 // Clear fp AND pc 2866 __ reset_last_Java_frame(true); 2867 2868 // Collect return values 2869 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2870 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2871 // I think this is useless (throwing pc?) 2872 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2873 2874 // Pop self-frame. 2875 __ leave(); // Epilog 2876 2877 // Jump to interpreter 2878 __ ret(0); 2879 2880 // Make sure all code is generated 2881 masm->flush(); 2882 2883 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2884 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2885 #if INCLUDE_JVMCI 2886 if (EnableJVMCI) { 2887 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2888 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2889 } 2890 #endif 2891 } 2892 2893 #ifdef COMPILER2 2894 //------------------------------generate_uncommon_trap_blob-------------------- 2895 void SharedRuntime::generate_uncommon_trap_blob() { 2896 // Allocate space for the code 2897 ResourceMark rm; 2898 // Setup code generation tools 2899 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2900 MacroAssembler* masm = new MacroAssembler(&buffer); 2901 2902 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2903 2904 address start = __ pc(); 2905 2906 if (UseRTMLocking) { 2907 // Abort RTM transaction before possible nmethod deoptimization. 2908 __ xabort(0); 2909 } 2910 2911 // Push self-frame. We get here with a return address on the 2912 // stack, so rsp is 8-byte aligned until we allocate our frame. 2913 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2914 2915 // No callee saved registers. 
rbp is assumed implicitly saved
2916 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2917
2918 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2919 // runtime expects it.
2920 __ movl(c_rarg1, j_rarg0);
2921
2922 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2923
2924 // Call C code. Need thread but NOT official VM entry
2925 // crud. We cannot block on this call, no GC can happen. Call should
2926 // capture callee-saved registers as well as return values.
2927 // Thread is in rdi already.
2928 //
2929 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2930
2931 __ mov(c_rarg0, r15_thread);
2932 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2933 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2934
2935 // Set an oopmap for the call site
2936 OopMapSet* oop_maps = new OopMapSet();
2937 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2938
2939 // location of rbp is known implicitly by the frame sender code
2940
2941 oop_maps->add_gc_map(__ pc() - start, map);
2942
2943 __ reset_last_Java_frame(false);
2944
2945 // Load UnrollBlock* into rdi
2946 __ mov(rdi, rax);
2947
2948 #ifdef ASSERT
2949 { Label L;
2950 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2951 Deoptimization::Unpack_uncommon_trap);
2952 __ jcc(Assembler::equal, L);
2953 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2954 __ bind(L);
2955 }
2956 #endif
2957
2958 // Pop all the frames we must move/replace.
2959 //
2960 // Frame picture (youngest to oldest)
2961 // 1: self-frame (no frame link)
2962 // 2: deopting frame (no frame link)
2963 // 3: caller of deopting frame (could be compiled/interpreted).
2964
2965 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2966 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2967
2968 // Pop deoptimized frame (int)
2969 __ movl(rcx, Address(rdi,
2970 Deoptimization::UnrollBlock::
2971 size_of_deoptimized_frame_offset()));
2972 __ addptr(rsp, rcx);
2973
2974 // rsp should be pointing at the return address to the caller (3)
2975
2976 // Pick up the initial fp we should save
2977 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2978 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2979
2980 #ifdef ASSERT
2981 // Compilers generate code that bang the stack by as much as the
2982 // interpreter would need. So this stack banging should never
2983 // trigger a fault. Verify that it does not on non product builds.
2984 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2985 __ bang_stack_size(rbx, rcx); 2986 #endif 2987 2988 // Load address of array of frame pcs into rcx (address*) 2989 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2990 2991 // Trash the return pc 2992 __ addptr(rsp, wordSize); 2993 2994 // Load address of array of frame sizes into rsi (intptr_t*) 2995 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset())); 2996 2997 // Counter 2998 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset())); // (int) 2999 3000 // Now adjust the caller's stack to make up for the extra locals but 3001 // record the original sp so that we can save it in the skeletal 3002 // interpreter frame and the stack walking of interpreter_sender 3003 // will get the unextended sp value and not the "real" sp value. 3004 3005 const Register sender_sp = r8; 3006 3007 __ mov(sender_sp, rsp); 3008 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset())); // (int) 3009 __ subptr(rsp, rbx); 3010 3011 // Push interpreter frames in a loop 3012 Label loop; 3013 __ bind(loop); 3014 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3015 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 3016 __ pushptr(Address(rcx, 0)); // Save return address 3017 __ enter(); // Save old & set new rbp 3018 __ subptr(rsp, rbx); // Prolog 3019 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 3020 sender_sp); // Make it walkable 3021 // This value is corrected by layout_activation_impl 3022 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3023 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3024 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3025 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3026 __ decrementl(rdx); // Decrement counter 3027 __ jcc(Assembler::notZero, loop); 3028 __ pushptr(Address(rcx, 0)); // Save final return address 3029 3030 // Re-push self-frame 3031 __ enter(); // Save old & set new rbp 3032 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 3033 // Prolog 3034 3035 // Use rbp because the frames look interpreted now 3036 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3037 // Don't need the precise return PC here, just precise enough to point into this code blob. 3038 address the_pc = __ pc(); 3039 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3040 3041 // Call C code. Need thread but NOT official VM entry 3042 // crud. We cannot block on this call, no GC can happen. Call should 3043 // restore return values to their stack-slots with the new SP. 3044 // Thread is in rdi already. 3045 // 3046 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 3047 3048 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 3049 __ mov(c_rarg0, r15_thread); 3050 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 3051 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3052 3053 // Set an oopmap for the call site 3054 // Use the same PC we used for the last java frame 3055 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3056 3057 // Clear fp AND pc 3058 __ reset_last_Java_frame(true); 3059 3060 // Pop self-frame. 
  __ leave();                      // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and sets up the oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code. Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  if (UseRTMLocking) {
    // Abort RTM transaction before calling runtime,
    // because the critical section will be large and will be
    // aborted anyway. Also the nmethod could be deoptimized.
    __ xabort(0);
  }

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its
    // appropriate place on the stack.
    // Additionally, rbx is a callee-saved register and we can look at it
    // later to determine if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map(__ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)
    //
    // (A C-level sketch of this length computation follows this stub.)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust the return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}
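
// A C-level sketch of the poll-instruction length computation used above,
// for illustration only (not compiled into the VM; the helper name is
// hypothetical). The byte values come from the encoding table in
// generate_handler_blob.
#if 0
static int poll_instruction_length(const unsigned char* pc) {
  int len = 2;                     // 0x85 opcode + modrm byte
  if (pc[0] == 0x41) {             // REX.B prefix: base register is r8..r15
    len++;
    pc++;
  }
  int rm = pc[1] & 0x07;           // low three bits of the modrm byte
  if (rm == 0x04 || rm == 0x05) {  // rsp/r12 take an extra SIB byte,
    len++;                         // rbp/r13 take an extra disp8 byte
  }
  return len;
}
#endif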

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the vm to find out the proper destination
// of a java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1200, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map(__ offset() - start, map);

  // rax contains the address we are going to jump to, assuming no exception
  // got installed.

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob (frame size is in words, as new_runtime_stub expects)
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//
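
// Explanatory note on the routines below: with R = 2^(64*len), and inv
// chosen so that n * inv == -1 (mod 2^64) -- which is what the asserts on
// inv * n[0] check -- montgomery_multiply(a, b, n, m, inv, len) computes
// m = a * b * R^-1 (mod n), and montgomery_square(a, n, m, inv, len)
// computes m = a^2 * R^-1 (mod n). All operands are arrays of 64-bit words,
// least significant word first.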

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                     \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                    \
do {                                               \
  julong hi, lo;                                   \
  lo = _umul128(A, B, &hi);                        \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
  c = _addcarry_u64(0, lo, T0, &T0);               \
  c = _addcarry_u64(c, hi, T1, &T1);               \
  _addcarry_u64(c, T2, 0, &T2);                    \
} while(0)

#endif //_WINDOWS
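
// What MACC does, as a portable sketch (illustrative only, not compiled):
// (T2:T1:T0) += A * B, where T2:T1:T0 is a 192-bit accumulator held in
// three 64-bit words. Compiler support for unsigned __int128 is assumed
// here purely for exposition.
#if 0
static void macc_reference(julong A, julong B,
                           julong& T0, julong& T1, julong& T2) {
  unsigned __int128 prod = (unsigned __int128)A * B;
  unsigned __int128 lo   = (unsigned __int128)T0 + (julong)prod;  // add low half
  T0 = (julong)lo;
  unsigned __int128 mid  = (unsigned __int128)T1
                         + (julong)(prod >> 64)                   // add high half
                         + (julong)(lo >> 64);                    // plus carry
  T1 = (julong)mid;
  T2 += (julong)(mid >> 64);                                      // final carry
}
#endif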

// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies, so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}
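
// Illustrative example (not compiled): reverse_words() converts between the
// caller's big-endian sequence of 32-bit words and the little-endian array
// of 64-bit words that the Montgomery routines above expect. For a
// two-longword input:
#if 0
static void reverse_words_example() {
  julong s[2] = { 0x0000000100000002ULL, 0x0000000300000004ULL };
  julong d[2];
  reverse_words(s, d, 2);
  // The array is reversed and each longword's 32-bit halves are swapped:
  assert(d[0] == 0x0000000400000003ULL, "");
  assert(d[1] == 0x0000000200000001ULL, "");
}
#endif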

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
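
// Worked example of the stack budgets above (illustrative note): for an
// input of 512 jints, longwords == 256, so montgomery_multiply's four
// scratch arrays occupy 256 * 8 * 4 == 8192 bytes and montgomery_square's
// three occupy 256 * 8 * 3 == 6144 bytes, matching the 8k and 6k figures
// in the comments.
static_assert(256 * sizeof(julong) * 4 == 8192, "montgomery_multiply stack budget");
static_assert(256 * sizeof(julong) * 3 == 6144, "montgomery_square stack budget");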

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Using the exception blob, this code is jumped from a compiled method.
// (see emit_exception_handler in x86_64.ad file)
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers. See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store the exception in the Thread object. We cannot pass any arguments
  // to the handle_exception call, since we do not want to make any
  // assumption about the size of the frame where the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work. It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site. This oopmap will only be used if we
  // are unwinding the stack. Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx); // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2