/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {
 public:
  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
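  // The XSAVE_AREA_* constants below are byte offsets of the state components
  // within a standard (non-compacted) XSAVE save area, as enumerated by CPUID
  // leaf 0xD: legacy FXSAVE XMM state at 160, YMM upper halves at 576, AVX-512
  // opmask registers at 1088, ZMM upper halves at 1152, and ZMM16..ZMM31 at 1664.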
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off, // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,       // copy of rbp we will restore
    return_off, returnH_off, // slot for return address
    reg_save_size            // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints), not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
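  //
  // A sketch of the resulting frame, from high to low addresses:
  //   [return address]          <- pushed by the caller
  //   [saved rbp]               <- pushed by enter()
  //   [GPRs, flags, FPU state]  <- pushed by push_CPU_state()
  //   [arg register save area]  <- reserved below, if the ABI requires one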

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
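  // Note that each 64-bit register is described by two adjacent 4-byte OopMap
  // slots: the low half at X_off and the high half at XH_off (see the layout
  // enum above).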

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop the whole register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
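  // For example, the signature (jint, jlong, jobject, jdouble) maps to
  // j_rarg0, j_rarg1, j_rarg2 and j_farg0; the T_VOID entries that follow
  // T_LONG and T_DOUBLE consume no register and are set_bad() below.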
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the caller's callsite with the entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee.
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing:
    // because we can fit a long/double in a single slot on a 64-bit VM and it
    // would be silly to break them up, the interpreter leaves one slot empty
    // and only stores to a single slot. In this case the slot that is occupied
    // is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory, use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, else we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  if (VerifyAdapterCalls &&
      (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    // Pick up the return address
    __ movptr(rax, Address(rsp, 0));
    Label L_ok;
    if (Interpreter::code() != nullptr) {
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(),
                  Interpreter::code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::initial_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::initial_stubs_code()->code_begin(),
                  StubRoutines::initial_stubs_code()->code_end(),
                  L_ok);
    }
    if (StubRoutines::final_stubs_code() != nullptr) {
      range_check(masm, rax, r11,
                  StubRoutines::final_stubs_code()->code_begin(),
                  StubRoutines::final_stubs_code()->code_end(),
                  L_ok);
    }
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
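  // For example, 5 slots -> 20 bytes -> rounded up to 24 bytes -> 3 words.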
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address, misaligning the stack exactly the way the
  // youngest frame expects it to be after a call instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it, should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
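  //
  // The c2i adapter gets two extra entry points: c2i_unverified_entry first
  // performs an inline-cache check against the expected receiver klass, and
  // c2i_entry additionally runs a class initialization barrier for static
  // methods on targets with fast class-init checks.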

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == nullptr, "not needed on x86");
  // We return the number of VMRegImpl stack slots we need to reserve for all
  // the arguments, NOT counting out_preserve_stack_slots.
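  //
  // For example, (jint, jdouble, jint) maps to c_rarg0, c_farg1, c_rarg2 on
  // Windows, where the integer and FP register banks advance in lockstep,
  // but to c_rarg0, c_farg0, c_rarg1 in the System V AMD64 ABI, where the
  // two banks advance independently.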

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64

  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
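  // (4 registers * 8 bytes = 32 bytes, i.e. 8 of the 4-byte VMRegImpl slots,
  // hence the minimum of 8 below.)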
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for (int i = first_arg; i < arg_count; i++) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for (int i = arg_count - 1; i >= first_arg; i--) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

static void verify_oop_args(MacroAssembler* masm,
                            const methodHandle& method,
                            const BasicType* sig_bt,
                            const VMRegPair* regs) {
  Register temp_reg = rbx; // not part of any compiled calling seq
  if (VerifyOops) {
    for (int i = 0; i < method->size_of_parameters(); i++) {
      if (is_reference_type(sig_bt[i])) {
        VMReg r = regs[i].first();
        assert(r->is_valid(), "bad oop arg");
        if (r->is_stack()) {
          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
          __ verify_oop(temp_reg);
        } else {
          __ verify_oop(r->as_Register());
        }
      }
    }
  }
}

static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}


//---------------------------- continuation_enter_setup ---------------------------
//
// Arguments:
//   None.
//
// Results:
//   rsp: pointer to blank ContinuationEntry
//
// Kills:
//   rax
//
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}

//---------------------------- fill_continuation_entry ---------------------------
//
// Arguments:
//   rsp: pointer to blank ContinuationEntry
//   reg_cont_obj: pointer to the continuation
//   reg_flags: flags
//
// Results:
//   rsp: pointer to filled out ContinuationEntry
//
// Kills:
//   rax
//
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
  __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
  __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
}

//---------------------------- continuation_enter_cleanup ---------------------------
//
// Arguments:
//   rsp: pointer to the ContinuationEntry
//
// Results:
//   rsp: pointer to the spilled rbp in the entry frame
//
// Kills:
//   rbx
//
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
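  // Check that rsp still points at the thread's current ContinuationEntry
  // before we unlink it.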
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
  __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);

  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}

static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That's okay: at the very worst we'll miss an async sample, and
    // we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    CodeBuffer* cbuf = masm->code_section()->outer();
    address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  CodeBuffer* cbuf = masm->code_section()->outer();
  address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
1495 __ call(resolve); 1496 1497 oop_maps->add_gc_map(__ pc() - start, map); 1498 __ post_call_nop(); 1499 1500 __ jmpb(L_exit); 1501 1502 // --- Thawing path 1503 1504 __ bind(L_thaw); 1505 1506 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1507 1508 ContinuationEntry::_return_pc_offset = __ pc() - start; 1509 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1510 __ post_call_nop(); 1511 1512 // --- Normal exit (resolve/thawing) 1513 1514 __ bind(L_exit); 1515 1516 continuation_enter_cleanup(masm); 1517 __ pop(rbp); 1518 __ ret(0); 1519 1520 // --- Exception handling path 1521 1522 exception_offset = __ pc() - start; 1523 1524 continuation_enter_cleanup(masm); 1525 __ pop(rbp); 1526 1527 __ movptr(c_rarg0, r15_thread); 1528 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1529 1530 // rax still holds the original exception oop, save it before the call 1531 __ push(rax); 1532 1533 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1534 __ movptr(rbx, rax); 1535 1536 // Continue at exception handler: 1537 // rax: exception oop 1538 // rbx: exception handler 1539 // rdx: exception pc 1540 __ pop(rax); 1541 __ verify_oop(rax); 1542 __ pop(rdx); 1543 __ jmp(rbx); 1544 } 1545 1546 static void gen_continuation_yield(MacroAssembler* masm, 1547 const VMRegPair* regs, 1548 OopMapSet* oop_maps, 1549 int& frame_complete, 1550 int& stack_slots, 1551 int& compiled_entry_offset) { 1552 enum layout { 1553 rbp_off, 1554 rbpH_off, 1555 return_off, 1556 return_off2, 1557 framesize // inclusive of return address 1558 }; 1559 stack_slots = framesize / VMRegImpl::slots_per_word; 1560 assert(stack_slots == 2, "recheck layout"); 1561 1562 address start = __ pc(); 1563 compiled_entry_offset = __ pc() - start; 1564 __ enter(); 1565 address the_pc = __ pc(); 1566 1567 frame_complete = the_pc - start; 1568 1569 // This nop must be exactly at the PC we push into the frame info. 1570 // We use this nop for fast CodeBlob lookup, associate the OopMap 1571 // with it right away. 
1572 __ post_call_nop(); 1573 OopMap* map = new OopMap(framesize, 1); 1574 oop_maps->add_gc_map(frame_complete, map); 1575 1576 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1577 __ movptr(c_rarg0, r15_thread); 1578 __ movptr(c_rarg1, rsp); 1579 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1580 __ reset_last_Java_frame(true); 1581 1582 Label L_pinned; 1583 1584 __ testptr(rax, rax); 1585 __ jcc(Assembler::notZero, L_pinned); 1586 1587 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1588 continuation_enter_cleanup(masm); 1589 __ pop(rbp); 1590 __ ret(0); 1591 1592 __ bind(L_pinned); 1593 1594 // Pinned, return to caller 1595 1596 // handle pending exception thrown by freeze 1597 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1598 Label ok; 1599 __ jcc(Assembler::equal, ok); 1600 __ leave(); 1601 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1602 __ bind(ok); 1603 1604 __ leave(); 1605 __ ret(0); 1606 } 1607 1608 static void gen_special_dispatch(MacroAssembler* masm, 1609 const methodHandle& method, 1610 const BasicType* sig_bt, 1611 const VMRegPair* regs) { 1612 verify_oop_args(masm, method, sig_bt, regs); 1613 vmIntrinsics::ID iid = method->intrinsic_id(); 1614 1615 // Now write the args into the outgoing interpreter space 1616 bool has_receiver = false; 1617 Register receiver_reg = noreg; 1618 int member_arg_pos = -1; 1619 Register member_reg = noreg; 1620 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1621 if (ref_kind != 0) { 1622 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1623 member_reg = rbx; // known to be free at this point 1624 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1625 } else if (iid == vmIntrinsics::_invokeBasic) { 1626 has_receiver = true; 1627 } else if (iid == vmIntrinsics::_linkToNative) { 1628 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1629 member_reg = rbx; // known to be free at this point 1630 } else { 1631 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1632 } 1633 1634 if (member_reg != noreg) { 1635 // Load the member_arg into register, if necessary. 1636 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1637 VMReg r = regs[member_arg_pos].first(); 1638 if (r->is_stack()) { 1639 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1640 } else { 1641 // no data motion is needed 1642 member_reg = r->as_Register(); 1643 } 1644 } 1645 1646 if (has_receiver) { 1647 // Make sure the receiver is loaded into a register. 1648 assert(method->size_of_parameters() > 0, "oob"); 1649 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1650 VMReg r = regs[0].first(); 1651 assert(r->is_valid(), "bad receiver arg"); 1652 if (r->is_stack()) { 1653 // Porting note: This assumes that compiled calling conventions always 1654 // pass the receiver oop in a register. If this is not true on some 1655 // platform, pick a temp and load the receiver from stack. 
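// (Editorial note, not upstream text: on x86_64 the compiled calling
// convention always passes the receiver in a register, so the stack branch
// below is unreachable; the fatal() documents that assumption, and the two
// lines following it are kept only as porting scaffolding.)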
1656 fatal("receiver always in a register"); 1657 receiver_reg = j_rarg0; // known to be free at this point 1658 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1659 } else { 1660 // no data motion is needed 1661 receiver_reg = r->as_Register(); 1662 } 1663 } 1664 1665 // Figure out which address we are really jumping to: 1666 MethodHandles::generate_method_handle_dispatch(masm, iid, 1667 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1668 } 1669 1670 // --------------------------------------------------------------------------- 1671 // Generate a native wrapper for a given method. The method takes arguments 1672 // in the Java compiled code convention, marshals them to the native 1673 // convention (handlizes oops, etc), transitions to native, makes the call, 1674 // returns to java state (possibly blocking), unhandlizes any result and 1675 // returns. 1676 // 1677 // Critical native functions are a shorthand for the use of 1678 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1679 // functions. The wrapper is expected to unpack the arguments before 1680 // passing them to the callee. Critical native functions leave the state _in_Java, 1681 // since they cannot stop for GC. 1682 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1683 // block and the check for pending exceptions it's impossible for them 1684 // to be thrown. 1685 // 1686 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1687 const methodHandle& method, 1688 int compile_id, 1689 BasicType* in_sig_bt, 1690 VMRegPair* in_regs, 1691 BasicType ret_type) { 1692 if (method->is_continuation_native_intrinsic()) { 1693 int exception_offset = -1; 1694 OopMapSet* oop_maps = new OopMapSet(); 1695 int frame_complete = -1; 1696 int stack_slots = -1; 1697 int interpreted_entry_offset = -1; 1698 int vep_offset = -1; 1699 if (method->is_continuation_enter_intrinsic()) { 1700 gen_continuation_enter(masm, 1701 in_regs, 1702 exception_offset, 1703 oop_maps, 1704 frame_complete, 1705 stack_slots, 1706 interpreted_entry_offset, 1707 vep_offset); 1708 } else if (method->is_continuation_yield_intrinsic()) { 1709 gen_continuation_yield(masm, 1710 in_regs, 1711 oop_maps, 1712 frame_complete, 1713 stack_slots, 1714 vep_offset); 1715 } else { 1716 guarantee(false, "Unknown Continuation native intrinsic"); 1717 } 1718 1719 #ifdef ASSERT 1720 if (method->is_continuation_enter_intrinsic()) { 1721 assert(interpreted_entry_offset != -1, "Must be set"); 1722 assert(exception_offset != -1, "Must be set"); 1723 } else { 1724 assert(interpreted_entry_offset == -1, "Must be unset"); 1725 assert(exception_offset == -1, "Must be unset"); 1726 } 1727 assert(frame_complete != -1, "Must be set"); 1728 assert(stack_slots != -1, "Must be set"); 1729 assert(vep_offset != -1, "Must be set"); 1730 #endif 1731 1732 __ flush(); 1733 nmethod* nm = nmethod::new_native_nmethod(method, 1734 compile_id, 1735 masm->code(), 1736 vep_offset, 1737 frame_complete, 1738 stack_slots, 1739 in_ByteSize(-1), 1740 in_ByteSize(-1), 1741 oop_maps, 1742 exception_offset); 1743 if (method->is_continuation_enter_intrinsic()) { 1744 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1745 } else if (method->is_continuation_yield_intrinsic()) { 1746 _cont_doYield_stub = nm; 1747 } 1748 return nm; 1749 } 1750 1751 if (method->is_method_handle_intrinsic()) { 1752 vmIntrinsics::ID iid = method->intrinsic_id(); 1753 intptr_t start = (intptr_t)__ pc(); 1754 int 
vep_offset = ((intptr_t)__ pc()) - start;
1755 gen_special_dispatch(masm,
1756 method,
1757 in_sig_bt,
1758 in_regs);
1759 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1760 __ flush();
1761 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1762 return nmethod::new_native_nmethod(method,
1763 compile_id,
1764 masm->code(),
1765 vep_offset,
1766 frame_complete,
1767 stack_slots / VMRegImpl::slots_per_word,
1768 in_ByteSize(-1),
1769 in_ByteSize(-1),
1770 nullptr);
1771 }
1772 address native_func = method->native_function();
1773 assert(native_func != nullptr, "must have function");
1774
1775 // An OopMap for lock (and class if static)
1776 OopMapSet *oop_maps = new OopMapSet();
1777 intptr_t start = (intptr_t)__ pc();
1778
1779 // We have received a description of where all the java args are located
1780 // on entry to the wrapper. We need to convert these args to where
1781 // the jni function will expect them. To figure out where they go
1782 // we convert the java signature to a C signature by inserting
1783 // the hidden arguments as arg[0] and possibly arg[1] (static method).
1784
1785 const int total_in_args = method->size_of_parameters();
1786 int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1787
1788 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1789 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1790 BasicType* in_elem_bt = nullptr;
1791
1792 int argc = 0;
1793 out_sig_bt[argc++] = T_ADDRESS;
1794 if (method->is_static()) {
1795 out_sig_bt[argc++] = T_OBJECT;
1796 }
1797
1798 for (int i = 0; i < total_in_args ; i++ ) {
1799 out_sig_bt[argc++] = in_sig_bt[i];
1800 }
1801
1802 // Now figure out where the args must be stored and how much stack space
1803 // they require.
1804 int out_arg_slots;
1805 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, nullptr, total_c_args);
1806
1807 // Compute framesize for the wrapper. We need to handlize all oops in
1808 // incoming registers.
1809
1810 // Calculate the total number of stack slots we will need.
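// (Editorial worked example, illustrative numbers only: with 4-byte stack
// slots, two slots per word, and 16-byte alignment, a static synchronized
// native method with no stack-passed C arguments, e.g. on Linux, accumulates
//   0 (out-preserve + out args) + 12 (oop handle area, 6 words)
//   + 2 (klass handle) + 2 (lock) + 6 (shuffle temps, return address, rbp)
//   = 22 slots, which the align_up further below rounds to 24 slots, i.e.
// a 96-byte frame body.)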
1811
1812 // First count the ABI requirement plus all of the outgoing args
1813 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1814
1815 // Now the space for the inbound oop handle area
1816 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1817
1818 int oop_handle_offset = stack_slots;
1819 stack_slots += total_save_slots;
1820
1821 // Now any space we need for handlizing a klass if this is a static method
1822
1823 int klass_slot_offset = 0;
1824 int klass_offset = -1;
1825 int lock_slot_offset = 0;
1826 bool is_static = false;
1827
1828 if (method->is_static()) {
1829 klass_slot_offset = stack_slots;
1830 stack_slots += VMRegImpl::slots_per_word;
1831 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1832 is_static = true;
1833 }
1834
1835 // Plus a lock if needed
1836
1837 if (method->is_synchronized()) {
1838 lock_slot_offset = stack_slots;
1839 stack_slots += VMRegImpl::slots_per_word;
1840 }
1841
1842 // Now a place (+2) to save return values or temps during shuffling
1843 // + 4 for return address (which we own) and saved rbp
1844 stack_slots += 6;
1845
1846 // OK, the space we have allocated will look like:
1847 //
1848 //
1849 // FP-> | |
1850 // |---------------------|
1851 // | 2 slots for moves |
1852 // |---------------------|
1853 // | lock box (if sync) |
1854 // |---------------------| <- lock_slot_offset
1855 // | klass (if static) |
1856 // |---------------------| <- klass_slot_offset
1857 // | oopHandle area |
1858 // |---------------------| <- oop_handle_offset (6 java arg registers)
1859 // | outbound memory |
1860 // | based arguments |
1861 // | |
1862 // |---------------------|
1863 // | |
1864 // SP-> | out_preserved_slots |
1865 //
1866 //
1867
1868
1869 // Now compute the actual number of stack slots we need, rounding up to keep
1870 // the stack properly aligned.
1871 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1872
1873 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1874
1875 // First thing, make an IC check to see if we should even be here
1876
1877 // We are free to use all registers as temps without saving them and
1878 // restoring them except rbp. rbp is the only callee save register
1879 // as far as the interpreter and the compiler(s) are concerned.
1880
1881
1882 const Register ic_reg = rax;
1883 const Register receiver = j_rarg0;
1884
1885 Label hit;
1886 Label exception_pending;
1887
1888 assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
1889 __ verify_oop(receiver);
1890 __ load_klass(rscratch1, receiver, rscratch2);
1891 __ cmpq(ic_reg, rscratch1);
1892 __ jcc(Assembler::equal, hit);
1893
1894 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1895
1896 // Verified entry point must be aligned
1897 __ align(8);
1898
1899 __ bind(hit);
1900
1901 int vep_offset = ((intptr_t)__ pc()) - start;
1902
1903 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1904 Label L_skip_barrier;
1905 Register klass = r10;
1906 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1907 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1908
1909 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1910
1911 __ bind(L_skip_barrier);
1912 }
1913
1914 #ifdef COMPILER1
1915 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
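// (Editorial gloss: when the mark word already carries an identity hash, the
// generated check below returns it straight from the header and skips the
// entire JNI transition; otherwise execution falls through into the wrapper.)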
1916 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1917 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1918 }
1919 #endif // COMPILER1
1920
1921 // The instruction at the verified entry point must be 5 bytes or longer
1922 // because it can be patched on the fly by make_non_entrant. The stack bang
1923 // instruction fits that requirement.
1924
1925 // Generate stack overflow check
1926 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1927
1928 // Generate a new frame for the wrapper.
1929 __ enter();
1930 // -2 because return address is already present and so is saved rbp
1931 __ subptr(rsp, stack_size - 2*wordSize);
1932
1933 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1934 // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
1935 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1936
1937 // Frame is now completed as far as size and linkage.
1938 int frame_complete = ((intptr_t)__ pc()) - start;
1939
1940 if (UseRTMLocking) {
1941 // Abort RTM transaction before calling JNI
1942 // because critical section will be large and will be
1943 // aborted anyway. Also nmethod could be deoptimized.
1944 __ xabort(0);
1945 }
1946
1947 #ifdef ASSERT
1948 __ check_stack_alignment(rsp, "improperly aligned stack");
1949 #endif /* ASSERT */
1950
1951
1952 // We use r14 as the oop handle for the receiver/klass.
1953 // It is callee-saved, so it survives the call to the native code.
1954
1955 const Register oop_handle_reg = r14;
1956
1957 //
1958 // We immediately shuffle the arguments so that for any vm call we have to
1959 // make from here on out (sync slow path, jvmti, etc.) we will have
1960 // captured the oops from our caller and have a valid oopMap for
1961 // them.
1962
1963 // -----------------
1964 // The Grand Shuffle
1965
1966 // The Java calling convention is either equal (linux) or denser (win64) than the
1967 // c calling convention. However, because of the jni_env argument, the c calling
1968 // convention always has at least one more argument (and two for static methods) than Java.
1969 // Therefore if we move the args from java -> c backwards then we will never have
1970 // a register->register conflict and we don't have to build a dependency graph
1971 // and figure out how to break any cycles.
1972 //
1973
1974 // Record esp-based slot for receiver on stack for non-static methods
1975 int receiver_offset = -1;
1976
1977 // This is a trick. We double the stack slots so we can claim
1978 // the oops in the caller's frame. Since we are sure to have
1979 // more args than the caller, doubling is enough to make
1980 // sure we can capture all the incoming oop args from the
1981 // caller.
1982 //
1983 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1984
1985 // Mark location of rbp (someday)
1986 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1987
1988 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1989 // All inbound args are referenced based on rbp and all outbound args via rsp.
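// (Editorial sketch of why the backward walk is safe, not upstream text:
// java arg k feeds C arg k+1, or k+2 for static methods. Walking k from last
// to first means that by the time C arg k+1's destination register is
// written, any java argument that could still occupy that register under an
// equal-or-denser C convention has an index greater than k and has therefore
// already been moved out.)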
1990 1991 1992 #ifdef ASSERT 1993 bool reg_destroyed[Register::number_of_registers]; 1994 bool freg_destroyed[XMMRegister::number_of_registers]; 1995 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 1996 reg_destroyed[r] = false; 1997 } 1998 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 1999 freg_destroyed[f] = false; 2000 } 2001 2002 #endif /* ASSERT */ 2003 2004 // For JNI natives the incoming and outgoing registers are offset upwards. 2005 GrowableArray<int> arg_order(2 * total_in_args); 2006 2007 VMRegPair tmp_vmreg; 2008 tmp_vmreg.set2(rbx->as_VMReg()); 2009 2010 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2011 arg_order.push(i); 2012 arg_order.push(c_arg); 2013 } 2014 2015 int temploc = -1; 2016 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2017 int i = arg_order.at(ai); 2018 int c_arg = arg_order.at(ai + 1); 2019 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2020 #ifdef ASSERT 2021 if (in_regs[i].first()->is_Register()) { 2022 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2023 } else if (in_regs[i].first()->is_XMMRegister()) { 2024 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2025 } 2026 if (out_regs[c_arg].first()->is_Register()) { 2027 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2028 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2029 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2030 } 2031 #endif /* ASSERT */ 2032 switch (in_sig_bt[i]) { 2033 case T_ARRAY: 2034 case T_OBJECT: 2035 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2036 ((i == 0) && (!is_static)), 2037 &receiver_offset); 2038 break; 2039 case T_VOID: 2040 break; 2041 2042 case T_FLOAT: 2043 __ float_move(in_regs[i], out_regs[c_arg]); 2044 break; 2045 2046 case T_DOUBLE: 2047 assert( i + 1 < total_in_args && 2048 in_sig_bt[i + 1] == T_VOID && 2049 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2050 __ double_move(in_regs[i], out_regs[c_arg]); 2051 break; 2052 2053 case T_LONG : 2054 __ long_move(in_regs[i], out_regs[c_arg]); 2055 break; 2056 2057 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2058 2059 default: 2060 __ move32_64(in_regs[i], out_regs[c_arg]); 2061 } 2062 } 2063 2064 int c_arg; 2065 2066 // Pre-load a static method's oop into r14. Used both by locking code and 2067 // the normal JNI call code. 2068 // point c_arg at the first arg that is already loaded in case we 2069 // need to spill before we call out 2070 c_arg = total_c_args - total_in_args; 2071 2072 if (method->is_static()) { 2073 2074 // load oop into a register 2075 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2076 2077 // Now handlize the static class mirror it's known not-null. 2078 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2079 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2080 2081 // Now get the handle 2082 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2083 // store the klass handle as second argument 2084 __ movptr(c_rarg1, oop_handle_reg); 2085 // and protect the arg if we must spill 2086 c_arg--; 2087 } 2088 2089 // Change state to native (we save the return address in the thread, since it might not 2090 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2091 // points into the right code segment. 
// It does not have to be the correct return pc.
2092 // We use the same pc/oopMap repeatedly when we call out
2093
2094 intptr_t the_pc = (intptr_t) __ pc();
2095 oop_maps->add_gc_map(the_pc - start, map);
2096
2097 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2098
2099
2100 // We have all of the arguments set up at this point. We must not touch any
2101 // argument registers from here on (we cannot simply save/restore them either, since the oop map would not describe the save slots).
2102
2103 {
2104 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2105 // protect the args we've loaded
2106 save_args(masm, total_c_args, c_arg, out_regs);
2107 __ mov_metadata(c_rarg1, method());
2108 __ call_VM_leaf(
2109 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2110 r15_thread, c_rarg1);
2111 restore_args(masm, total_c_args, c_arg, out_regs);
2112 }
2113
2114 // RedefineClasses() tracing support for obsolete method entry
2115 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2116 // protect the args we've loaded
2117 save_args(masm, total_c_args, c_arg, out_regs);
2118 __ mov_metadata(c_rarg1, method());
2119 __ call_VM_leaf(
2120 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2121 r15_thread, c_rarg1);
2122 restore_args(masm, total_c_args, c_arg, out_regs);
2123 }
2124
2125 // Lock a synchronized method
2126
2127 // Register definitions used by locking and unlocking
2128
2129 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2130 const Register obj_reg = rbx; // Will contain the oop
2131 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2132 const Register old_hdr = r13; // value of old header at unlock time
2133
2134 Label slow_path_lock;
2135 Label lock_done;
2136
2137 if (method->is_synchronized()) {
2138 Label count_mon;
2139
2140 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2141
2142 // Get the handle (the 2nd argument)
2143 __ mov(oop_handle_reg, c_rarg1);
2144
2145 // Get address of the box
2146
2147 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2148
2149 // Load the oop from the handle
2150 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2151
2152 if (LockingMode == LM_MONITOR) {
2153 __ jmp(slow_path_lock);
2154 } else if (LockingMode == LM_LEGACY) {
2155 // Load immediate 1 into swap_reg %rax
2156 __ movl(swap_reg, 1);
2157
2158 // Load (object->mark() | 1) into swap_reg %rax
2159 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2160
2161 // Save (object->mark() | 1) into BasicLock's displaced header
2162 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2163
2164 // src -> dest iff dest == rax else rax <- dest
2165 __ lock();
2166 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2167 __ jcc(Assembler::equal, count_mon);
2168
2169 // Hmm should this move to the slow path code area???
2170
2171 // Test if the oopMark is an obvious stack pointer, i.e.,
2172 // 1) (mark & 3) == 0, and
2173 // 2) rsp <= mark < rsp + os::vm_page_size()
2174 // These 3 tests can be done by evaluating the following
2175 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2176 // assuming both the stack pointer and the page size have their
2177 // least significant 2 bits clear.
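// (Editorial worked example, assuming a 4096-byte page: 3 - 4096 == -4093 ==
// 0xffff...f003 in two's complement, so the andptr below keeps only the low
// two bits and the bits at or above the page size. The result is zero
// exactly when the mark's low two bits are clear and 0 <= mark - rsp < 4096,
// i.e. the displaced header is a stack address within one page above rsp:
// the recursive-lock case.)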
2178 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2179
2180 __ subptr(swap_reg, rsp);
2181 __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2182
2183 // Save the test result, for recursive case, the result is zero
2184 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2185 __ jcc(Assembler::notEqual, slow_path_lock);
2186 } else {
2187 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2188 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2189 }
2190 __ bind(count_mon);
2191 __ inc_held_monitor_count();
2192
2193 // Slow path will re-enter here
2194 __ bind(lock_done);
2195 }
2196
2197 // Finally just about ready to make the JNI call
2198
2199 // get JNIEnv* which is first argument to native
2200 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2201
2202 // Now set thread in native
2203 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2204
2205 __ call(RuntimeAddress(native_func));
2206
2207 // Verify or restore cpu control state after JNI call
2208 __ restore_cpu_control_state_after_jni(rscratch1);
2209
2210 // Unpack native results.
2211 switch (ret_type) {
2212 case T_BOOLEAN: __ c2bool(rax); break;
2213 case T_CHAR : __ movzwl(rax, rax); break;
2214 case T_BYTE : __ sign_extend_byte (rax); break;
2215 case T_SHORT : __ sign_extend_short(rax); break;
2216 case T_INT : /* nothing to do */ break;
2217 case T_DOUBLE :
2218 case T_FLOAT :
2219 // Result is in xmm0 we'll save as needed
2220 break;
2221 case T_ARRAY: // Really a handle
2222 case T_OBJECT: // Really a handle
2223 break; // can't de-handlize until after safepoint check
2224 case T_VOID: break;
2225 case T_LONG: break;
2226 default : ShouldNotReachHere();
2227 }
2228
2229 Label after_transition;
2230
2231 // Switch thread to "native transition" state before reading the synchronization state.
2232 // This additional state is necessary because reading and testing the synchronization
2233 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2234 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2235 // VM thread changes sync state to synchronizing and suspends threads for GC.
2236 // Thread A is resumed to finish this native method, but doesn't block here since it
2237 // didn't see any synchronization in progress, and escapes.
2238 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2239
2240 // Force this write out before the read below
2241 if (!UseSystemMemoryBarrier) {
2242 __ membar(Assembler::Membar_mask_bits(
2243 Assembler::LoadLoad | Assembler::LoadStore |
2244 Assembler::StoreLoad | Assembler::StoreStore));
2245 }
2246
2247 // check for safepoint operation in progress and/or pending suspend requests
2248 {
2249 Label Continue;
2250 Label slow_path;
2251
2252 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2253
2254 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2255 __ jcc(Assembler::equal, Continue);
2256 __ bind(slow_path);
2257
2258 // Don't use call_VM, as it will see the possible pending exception and forward it,
2259 // never returning here and thus preventing us from clearing _last_native_pc down below.
2260 // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2261 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2262 // by hand.
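// (Editorial summary of the by-hand sequence below: save any native result,
// park rsp in callee-saved r12, reserve the Windows register-argument shadow
// area, align rsp to 16 bytes as the C ABI requires, make the call, then
// restore rsp, the heap base, and the saved result.)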
2263 // 2264 __ vzeroupper(); 2265 save_native_result(masm, ret_type, stack_slots); 2266 __ mov(c_rarg0, r15_thread); 2267 __ mov(r12, rsp); // remember sp 2268 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2269 __ andptr(rsp, -16); // align stack as required by ABI 2270 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2271 __ mov(rsp, r12); // restore sp 2272 __ reinit_heapbase(); 2273 // Restore any method result value 2274 restore_native_result(masm, ret_type, stack_slots); 2275 __ bind(Continue); 2276 } 2277 2278 // change thread state 2279 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2280 __ bind(after_transition); 2281 2282 Label reguard; 2283 Label reguard_done; 2284 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2285 __ jcc(Assembler::equal, reguard); 2286 __ bind(reguard_done); 2287 2288 // native result if any is live 2289 2290 // Unlock 2291 Label slow_path_unlock; 2292 Label unlock_done; 2293 if (method->is_synchronized()) { 2294 2295 Label fast_done; 2296 2297 // Get locked oop from the handle we passed to jni 2298 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2299 2300 if (LockingMode == LM_LEGACY) { 2301 Label not_recur; 2302 // Simple recursive lock? 2303 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2304 __ jcc(Assembler::notEqual, not_recur); 2305 __ dec_held_monitor_count(); 2306 __ jmpb(fast_done); 2307 __ bind(not_recur); 2308 } 2309 2310 // Must save rax if it is live now because cmpxchg must use it 2311 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2312 save_native_result(masm, ret_type, stack_slots); 2313 } 2314 2315 if (LockingMode == LM_MONITOR) { 2316 __ jmp(slow_path_unlock); 2317 } else if (LockingMode == LM_LEGACY) { 2318 // get address of the stack lock 2319 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2320 // get old displaced header 2321 __ movptr(old_hdr, Address(rax, 0)); 2322 2323 // Atomic swap old header if oop still contains the stack lock 2324 __ lock(); 2325 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2326 __ jcc(Assembler::notEqual, slow_path_unlock); 2327 __ dec_held_monitor_count(); 2328 } else { 2329 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2330 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2331 __ dec_held_monitor_count(); 2332 } 2333 2334 // slow path re-enters here 2335 __ bind(unlock_done); 2336 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2337 restore_native_result(masm, ret_type, stack_slots); 2338 } 2339 2340 __ bind(fast_done); 2341 } 2342 { 2343 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2344 save_native_result(masm, ret_type, stack_slots); 2345 __ mov_metadata(c_rarg1, method()); 2346 __ call_VM_leaf( 2347 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2348 r15_thread, c_rarg1); 2349 restore_native_result(masm, ret_type, stack_slots); 2350 } 2351 2352 __ reset_last_Java_frame(false); 2353 2354 // Unbox oop result, e.g. JNIHandles::resolve value. 
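// (Editorial gloss: the native method returned a jobject, i.e. a JNI handle;
// resolve_jobject dereferences it into a raw oop in rax, with a null handle
// simply yielding a null oop.)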
2355 if (is_reference_type(ret_type)) { 2356 __ resolve_jobject(rax /* value */, 2357 r15_thread /* thread */, 2358 rcx /* tmp */); 2359 } 2360 2361 if (CheckJNICalls) { 2362 // clear_pending_jni_exception_check 2363 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2364 } 2365 2366 // reset handle block 2367 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2368 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2369 2370 // pop our frame 2371 2372 __ leave(); 2373 2374 // Any exception pending? 2375 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2376 __ jcc(Assembler::notEqual, exception_pending); 2377 2378 // Return 2379 2380 __ ret(0); 2381 2382 // Unexpected paths are out of line and go here 2383 2384 // forward the exception 2385 __ bind(exception_pending); 2386 2387 // and forward the exception 2388 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2389 2390 // Slow path locking & unlocking 2391 if (method->is_synchronized()) { 2392 2393 // BEGIN Slow path lock 2394 __ bind(slow_path_lock); 2395 2396 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2397 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2398 2399 // protect the args we've loaded 2400 save_args(masm, total_c_args, c_arg, out_regs); 2401 2402 __ mov(c_rarg0, obj_reg); 2403 __ mov(c_rarg1, lock_reg); 2404 __ mov(c_rarg2, r15_thread); 2405 2406 // Not a leaf but we have last_Java_frame setup as we want 2407 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2408 restore_args(masm, total_c_args, c_arg, out_regs); 2409 2410 #ifdef ASSERT 2411 { Label L; 2412 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2413 __ jcc(Assembler::equal, L); 2414 __ stop("no pending exception allowed on exit from monitorenter"); 2415 __ bind(L); 2416 } 2417 #endif 2418 __ jmp(lock_done); 2419 2420 // END Slow path lock 2421 2422 // BEGIN Slow path unlock 2423 __ bind(slow_path_unlock); 2424 2425 // If we haven't already saved the native result we must save it now as xmm registers 2426 // are still exposed. 
2427 __ vzeroupper();
2428 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2429 save_native_result(masm, ret_type, stack_slots);
2430 }
2431
2432 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2433
2434 __ mov(c_rarg0, obj_reg);
2435 __ mov(c_rarg2, r15_thread);
2436 __ mov(r12, rsp); // remember sp
2437 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2438 __ andptr(rsp, -16); // align stack as required by ABI
2439
2440 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2441 // NOTE that obj_reg == rbx currently
2442 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2443 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2444
2445 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2446 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2447 __ mov(rsp, r12); // restore sp
2448 __ reinit_heapbase();
2449 #ifdef ASSERT
2450 {
2451 Label L;
2452 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2453 __ jcc(Assembler::equal, L);
2454 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2455 __ bind(L);
2456 }
2457 #endif /* ASSERT */
2458
2459 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2460
2461 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2462 restore_native_result(masm, ret_type, stack_slots);
2463 }
2464 __ jmp(unlock_done);
2465
2466 // END Slow path unlock
2467
2468 } // synchronized
2469
2470 // SLOW PATH Reguard the stack if needed
2471
2472 __ bind(reguard);
2473 __ vzeroupper();
2474 save_native_result(masm, ret_type, stack_slots);
2475 __ mov(r12, rsp); // remember sp
2476 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2477 __ andptr(rsp, -16); // align stack as required by ABI
2478 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2479 __ mov(rsp, r12); // restore sp
2480 __ reinit_heapbase();
2481 restore_native_result(masm, ret_type, stack_slots);
2482 // and continue
2483 __ jmp(reguard_done);
2484
2485
2486
2487 __ flush();
2488
2489 nmethod *nm = nmethod::new_native_nmethod(method,
2490 compile_id,
2491 masm->code(),
2492 vep_offset,
2493 frame_complete,
2494 stack_slots / VMRegImpl::slots_per_word,
2495 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2496 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2497 oop_maps);
2498
2499 return nm;
2500 }
2501
2502 // This function returns the adjustment (in number of words) applied to a c2i adapter
2503 // activation for use during deoptimization.
2504 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2505 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2506 }
2507
2508
2509 uint SharedRuntime::out_preserve_stack_slots() {
2510 return 0;
2511 }
2512
2513
2514 // Number of stack slots between incoming argument block and the start of
2515 // a new frame. The PROLOG must add this many slots to the stack. The
2516 // EPILOG must remove this many slots. amd64 needs two slots for the
2517 // return address and two for the saved rbp.
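// (Editorial arithmetic: a word is 8 bytes, i.e. two 4-byte slots, so the
// return address and saved rbp account for the constant 4 below; the
// 2 * VerifyStackAtCalls term reserves one extra word for the stack-check
// cookie when that debug flag is enabled.)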
2518 uint SharedRuntime::in_preserve_stack_slots() { 2519 return 4 + 2 * VerifyStackAtCalls; 2520 } 2521 2522 //------------------------------generate_deopt_blob---------------------------- 2523 void SharedRuntime::generate_deopt_blob() { 2524 // Allocate space for the code 2525 ResourceMark rm; 2526 // Setup code generation tools 2527 int pad = 0; 2528 if (UseAVX > 2) { 2529 pad += 1024; 2530 } 2531 #if INCLUDE_JVMCI 2532 if (EnableJVMCI) { 2533 pad += 512; // Increase the buffer size when compiling for JVMCI 2534 } 2535 #endif 2536 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2537 MacroAssembler* masm = new MacroAssembler(&buffer); 2538 int frame_size_in_words; 2539 OopMap* map = nullptr; 2540 OopMapSet *oop_maps = new OopMapSet(); 2541 2542 // ------------- 2543 // This code enters when returning to a de-optimized nmethod. A return 2544 // address has been pushed on the stack, and return values are in 2545 // registers. 2546 // If we are doing a normal deopt then we were called from the patched 2547 // nmethod from the point we returned to the nmethod. So the return 2548 // address on the stack is wrong by NativeCall::instruction_size 2549 // We will adjust the value so it looks like we have the original return 2550 // address on the stack (like when we eagerly deoptimized). 2551 // In the case of an exception pending when deoptimizing, we enter 2552 // with a return address on the stack that points after the call we patched 2553 // into the exception handler. We have the following register state from, 2554 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2555 // rax: exception oop 2556 // rbx: exception handler 2557 // rdx: throwing pc 2558 // So in this case we simply jam rdx into the useless return address and 2559 // the stack looks just like we want. 2560 // 2561 // At this point we need to de-opt. We save the argument return 2562 // registers. We call the first C routine, fetch_unroll_info(). This 2563 // routine captures the return values and returns a structure which 2564 // describes the current frame size and the sizes of all replacement frames. 2565 // The current frame is compiled code and may contain many inlined 2566 // functions, each with their own JVM state. We pop the current frame, then 2567 // push all the new frames. Then we call the C routine unpack_frames() to 2568 // populate these frames. Finally unpack_frames() returns us the new target 2569 // address. Notice that callee-save registers are BLOWN here; they have 2570 // already been captured in the vframeArray at the time the return PC was 2571 // patched. 2572 address start = __ pc(); 2573 Label cont; 2574 2575 // Prolog for non exception case! 2576 2577 // Save everything in sight. 2578 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2579 2580 // Normal deoptimization. Save exec mode for unpack_frames. 
2581 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2582 __ jmp(cont);
2583
2584 int reexecute_offset = __ pc() - start;
2585 #if INCLUDE_JVMCI && !defined(COMPILER1)
2586 if (EnableJVMCI && UseJVMCICompiler) {
2587 // JVMCI does not use this kind of deoptimization
2588 __ should_not_reach_here();
2589 }
2590 #endif
2591
2592 // Reexecute case
2593 // The return address is the pc that describes what bci to re-execute at.
2594
2595 // No need to update map as each call to save_live_registers will produce identical oopmap
2596 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2597
2598 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2599 __ jmp(cont);
2600
2601 #if INCLUDE_JVMCI
2602 Label after_fetch_unroll_info_call;
2603 int implicit_exception_uncommon_trap_offset = 0;
2604 int uncommon_trap_offset = 0;
2605
2606 if (EnableJVMCI) {
2607 implicit_exception_uncommon_trap_offset = __ pc() - start;
2608
2609 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2610 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2611
2612 uncommon_trap_offset = __ pc() - start;
2613
2614 // Save everything in sight.
2615 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2616 // fetch_unroll_info needs to call last_java_frame()
2617 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2618
2619 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2620 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2621
2622 __ movl(r14, Deoptimization::Unpack_reexecute);
2623 __ mov(c_rarg0, r15_thread);
2624 __ movl(c_rarg2, r14); // exec mode
2625 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2626 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2627
2628 __ reset_last_Java_frame(false);
2629
2630 __ jmp(after_fetch_unroll_info_call);
2631 } // EnableJVMCI
2632 #endif // INCLUDE_JVMCI
2633
2634 int exception_offset = __ pc() - start;
2635
2636 // Prolog for exception case
2637
2638 // all registers are dead at this entry point, except for rax and
2639 // rdx, which contain the exception oop and exception pc
2640 // respectively. Set them in TLS and fall thru to the
2641 // unpack_with_exception_in_tls entry point.
2642
2643 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2644 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2645
2646 int exception_in_tls_offset = __ pc() - start;
2647
2648 // new implementation because exception oop is now passed in JavaThread
2649
2650 // Prolog for exception case
2651 // All registers must be preserved because they might be used by LinearScan
2652 // Exception oop and throwing PC are passed in JavaThread
2653 // tos: stack at point of call to method that threw the exception (i.e. only
2654 // args are on the stack, no return address)
2655
2656 // make room on stack for the return address
2657 // It will be patched later with the throwing pc. The correct value is not
2658 // available now because loading it from memory would destroy registers.
2659 __ push(0);
2660
2661 // Save everything in sight.
2662 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2663
2664 // Now it is safe to overwrite any register
2665
2666 // Deopt during an exception.
// Save exec mode for unpack_frames.
2667 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2668
2669 // load throwing pc from JavaThread and patch it as the return address
2670 // of the current frame. Then clear the field in JavaThread.
2671
2672 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2673 __ movptr(Address(rbp, wordSize), rdx);
2674 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2675
2676 #ifdef ASSERT
2677 // verify that there is really an exception oop in JavaThread
2678 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2679 __ verify_oop(rax);
2680
2681 // verify that there is no pending exception
2682 Label no_pending_exception;
2683 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2684 __ testptr(rax, rax);
2685 __ jcc(Assembler::zero, no_pending_exception);
2686 __ stop("must not have pending exception here");
2687 __ bind(no_pending_exception);
2688 #endif
2689
2690 __ bind(cont);
2691
2692 // Call C code. Need thread and this frame, but NOT official VM entry
2693 // crud. We cannot block on this call, no GC can happen.
2694 //
2695 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2696
2697 // fetch_unroll_info needs to call last_java_frame().
2698
2699 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2700 #ifdef ASSERT
2701 { Label L;
2702 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2703 __ jcc(Assembler::equal, L);
2704 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2705 __ bind(L);
2706 }
2707 #endif // ASSERT
2708 __ mov(c_rarg0, r15_thread);
2709 __ movl(c_rarg1, r14); // exec_mode
2710 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2711
2712 // Need to have an oopmap that tells fetch_unroll_info where to
2713 // find any register it might need.
2714 oop_maps->add_gc_map(__ pc() - start, map);
2715
2716 __ reset_last_Java_frame(false);
2717
2718 #if INCLUDE_JVMCI
2719 if (EnableJVMCI) {
2720 __ bind(after_fetch_unroll_info_call);
2721 }
2722 #endif
2723
2724 // Load UnrollBlock* into rdi
2725 __ mov(rdi, rax);
2726
2727 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2728 Label noException;
2729 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2730 __ jcc(Assembler::notEqual, noException);
2731 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2732 // QQQ this is useless it was null above
2733 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2734 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2735 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2736
2737 __ verify_oop(rax);
2738
2739 // Overwrite the result registers with the exception results.
2740 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2741 // I think this is useless
2742 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2743
2744 __ bind(noException);
2745
2746 // Only register save data is on the stack.
2747 // Now restore the result registers. Everything else is either dead
2748 // or captured in the vframeArray.
2749 RegisterSaver::restore_result_registers(masm);
2750
2751 // All of the register save area has been popped off the stack. Only the
2752 // return address remains.
2753
2754 // Pop all the frames we must move/replace.
2755 // 2756 // Frame picture (youngest to oldest) 2757 // 1: self-frame (no frame link) 2758 // 2: deopting frame (no frame link) 2759 // 3: caller of deopting frame (could be compiled/interpreted). 2760 // 2761 // Note: by leaving the return address of self-frame on the stack 2762 // and using the size of frame 2 to adjust the stack 2763 // when we are done the return to frame 3 will still be on the stack. 2764 2765 // Pop deoptimized frame 2766 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2767 __ addptr(rsp, rcx); 2768 2769 // rsp should be pointing at the return address to the caller (3) 2770 2771 // Pick up the initial fp we should save 2772 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2773 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2774 2775 #ifdef ASSERT 2776 // Compilers generate code that bang the stack by as much as the 2777 // interpreter would need. So this stack banging should never 2778 // trigger a fault. Verify that it does not on non product builds. 2779 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2780 __ bang_stack_size(rbx, rcx); 2781 #endif 2782 2783 // Load address of array of frame pcs into rcx 2784 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2785 2786 // Trash the old pc 2787 __ addptr(rsp, wordSize); 2788 2789 // Load address of array of frame sizes into rsi 2790 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2791 2792 // Load counter into rdx 2793 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2794 2795 // Now adjust the caller's stack to make up for the extra locals 2796 // but record the original sp so that we can save it in the skeletal interpreter 2797 // frame and the stack walking of interpreter_sender will get the unextended sp 2798 // value and not the "real" sp value. 2799 2800 const Register sender_sp = r8; 2801 2802 __ mov(sender_sp, rsp); 2803 __ movl(rbx, Address(rdi, 2804 Deoptimization::UnrollBlock:: 2805 caller_adjustment_offset())); 2806 __ subptr(rsp, rbx); 2807 2808 // Push interpreter frames in a loop 2809 Label loop; 2810 __ bind(loop); 2811 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2812 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2813 __ pushptr(Address(rcx, 0)); // Save return address 2814 __ enter(); // Save old & set new ebp 2815 __ subptr(rsp, rbx); // Prolog 2816 // This value is corrected by layout_activation_impl 2817 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2818 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2819 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2820 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2821 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2822 __ decrementl(rdx); // Decrement counter 2823 __ jcc(Assembler::notZero, loop); 2824 __ pushptr(Address(rcx, 0)); // Save final return address 2825 2826 // Re-push self-frame 2827 __ enter(); // Save old & set new ebp 2828 2829 // Allocate a full sized register save area. 2830 // Return address and rbp are in place, so we allocate two less words. 
2831 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2832 2833 // Restore frame locals after moving the frame 2834 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2835 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2836 2837 // Call C code. Need thread but NOT official VM entry 2838 // crud. We cannot block on this call, no GC can happen. Call should 2839 // restore return values to their stack-slots with the new SP. 2840 // 2841 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2842 2843 // Use rbp because the frames look interpreted now 2844 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2845 // Don't need the precise return PC here, just precise enough to point into this code blob. 2846 address the_pc = __ pc(); 2847 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2848 2849 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2850 __ mov(c_rarg0, r15_thread); 2851 __ movl(c_rarg1, r14); // second arg: exec_mode 2852 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2853 // Revert SP alignment after call since we're going to do some SP relative addressing below 2854 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2855 2856 // Set an oopmap for the call site 2857 // Use the same PC we used for the last java frame 2858 oop_maps->add_gc_map(the_pc - start, 2859 new OopMap( frame_size_in_words, 0 )); 2860 2861 // Clear fp AND pc 2862 __ reset_last_Java_frame(true); 2863 2864 // Collect return values 2865 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2866 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2867 // I think this is useless (throwing pc?) 2868 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2869 2870 // Pop self-frame. 2871 __ leave(); // Epilog 2872 2873 // Jump to interpreter 2874 __ ret(0); 2875 2876 // Make sure all code is generated 2877 masm->flush(); 2878 2879 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2880 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2881 #if INCLUDE_JVMCI 2882 if (EnableJVMCI) { 2883 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2884 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2885 } 2886 #endif 2887 } 2888 2889 #ifdef COMPILER2 2890 //------------------------------generate_uncommon_trap_blob-------------------- 2891 void SharedRuntime::generate_uncommon_trap_blob() { 2892 // Allocate space for the code 2893 ResourceMark rm; 2894 // Setup code generation tools 2895 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2896 MacroAssembler* masm = new MacroAssembler(&buffer); 2897 2898 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2899 2900 address start = __ pc(); 2901 2902 if (UseRTMLocking) { 2903 // Abort RTM transaction before possible nmethod deoptimization. 2904 __ xabort(0); 2905 } 2906 2907 // Push self-frame. We get here with a return address on the 2908 // stack, so rsp is 8-byte aligned until we allocate our frame. 2909 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2910 2911 // No callee saved registers. 
// rbp is assumed implicitly saved
2912 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2913
2914 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2915 // runtime expects it.
2916 __ movl(c_rarg1, j_rarg0);
2917
2918 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2919
2920 // Call C code. Need thread but NOT official VM entry
2921 // crud. We cannot block on this call, no GC can happen. Call should
2922 // capture callee-saved registers as well as return values.
2923 // Thread is in rdi already.
2924 //
2925 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2926
2927 __ mov(c_rarg0, r15_thread);
2928 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2929 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2930
2931 // Set an oopmap for the call site
2932 OopMapSet* oop_maps = new OopMapSet();
2933 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2934
2935 // location of rbp is known implicitly by the frame sender code
2936
2937 oop_maps->add_gc_map(__ pc() - start, map);
2938
2939 __ reset_last_Java_frame(false);
2940
2941 // Load UnrollBlock* into rdi
2942 __ mov(rdi, rax);
2943
2944 #ifdef ASSERT
2945 { Label L;
2946 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2947 Deoptimization::Unpack_uncommon_trap);
2948 __ jcc(Assembler::equal, L);
2949 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2950 __ bind(L);
2951 }
2952 #endif
2953
2954 // Pop all the frames we must move/replace.
2955 //
2956 // Frame picture (youngest to oldest)
2957 // 1: self-frame (no frame link)
2958 // 2: deopting frame (no frame link)
2959 // 3: caller of deopting frame (could be compiled/interpreted).
2960
2961 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2962 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2963
2964 // Pop deoptimized frame (int)
2965 __ movl(rcx, Address(rdi,
2966 Deoptimization::UnrollBlock::
2967 size_of_deoptimized_frame_offset()));
2968 __ addptr(rsp, rcx);
2969
2970 // rsp should be pointing at the return address to the caller (3)
2971
2972 // Pick up the initial fp we should save
2973 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2974 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2975
2976 #ifdef ASSERT
2977 // Compilers generate code that bang the stack by as much as the
2978 // interpreter would need. So this stack banging should never
2979 // trigger a fault. Verify that it does not on non product builds.
2980 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2981 __ bang_stack_size(rbx, rcx); 2982 #endif 2983 2984 // Load address of array of frame pcs into rcx (address*) 2985 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2986 2987 // Trash the return pc 2988 __ addptr(rsp, wordSize); 2989 2990 // Load address of array of frame sizes into rsi (intptr_t*) 2991 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset())); 2992 2993 // Counter 2994 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset())); // (int) 2995 2996 // Now adjust the caller's stack to make up for the extra locals but 2997 // record the original sp so that we can save it in the skeletal 2998 // interpreter frame and the stack walking of interpreter_sender 2999 // will get the unextended sp value and not the "real" sp value. 3000 3001 const Register sender_sp = r8; 3002 3003 __ mov(sender_sp, rsp); 3004 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset())); // (int) 3005 __ subptr(rsp, rbx); 3006 3007 // Push interpreter frames in a loop 3008 Label loop; 3009 __ bind(loop); 3010 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3011 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 3012 __ pushptr(Address(rcx, 0)); // Save return address 3013 __ enter(); // Save old & set new rbp 3014 __ subptr(rsp, rbx); // Prolog 3015 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 3016 sender_sp); // Make it walkable 3017 // This value is corrected by layout_activation_impl 3018 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3019 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3020 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3021 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3022 __ decrementl(rdx); // Decrement counter 3023 __ jcc(Assembler::notZero, loop); 3024 __ pushptr(Address(rcx, 0)); // Save final return address 3025 3026 // Re-push self-frame 3027 __ enter(); // Save old & set new rbp 3028 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 3029 // Prolog 3030 3031 // Use rbp because the frames look interpreted now 3032 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3033 // Don't need the precise return PC here, just precise enough to point into this code blob. 3034 address the_pc = __ pc(); 3035 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3036 3037 // Call C code. Need thread but NOT official VM entry 3038 // crud. We cannot block on this call, no GC can happen. Call should 3039 // restore return values to their stack-slots with the new SP. 3040 // Thread is in rdi already. 3041 // 3042 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 3043 3044 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 3045 __ mov(c_rarg0, r15_thread); 3046 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 3047 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3048 3049 // Set an oopmap for the call site 3050 // Use the same PC we used for the last java frame 3051 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3052 3053 // Clear fp AND pc 3054 __ reset_last_Java_frame(true); 3055 3056 // Pop self-frame. 
  __ leave();                 // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers
// and sets up the oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code. Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  if (UseRTMLocking) {
    // Abort RTM transaction before calling runtime
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.
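  //
  // (Explanatory addition, not from the original comment:) the offset passed
  // to add_gc_map() below is the call's return address relative to the blob
  // start, so a later stack walk can select this map by pc alone. E.g. if
  // the call instruction ends 0x2c bytes into the blob, the map is keyed at
  // pc_offset 0x2c, and a frame whose return pc is (blob_start + 0x2c) maps
  // to exactly this oopmap.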

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find the proper destination
// of a Java call.
// All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1200, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob (the frame size passed to new_runtime_stub is in words)
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
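//
// (Explanatory addition:) a portable sketch of the same operation, assuming
// a compiler with the 128-bit __int128 extension -- illustration only, the
// build uses the inline-asm version below (inc/dec are used in the asm loop
// precisely because they leave the carry flag untouched):
//
//   static julong sub_ref(julong a[], julong b[], julong carry, long len) {
//     unsigned __int128 borrow = 0;
//     for (long i = 0; i < len; i++) {
//       unsigned __int128 d = (unsigned __int128)a[i] - b[i] - (julong)borrow;
//       a[i] = (julong)d;                 // low 64 bits of the difference
//       borrow = (d >> 64) & 1;           // 1 iff the subtraction wrapped
//     }
//     return carry - (julong)borrow;      // new top word
//   }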
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                      \
do {                                                                \
  unsigned long hi, lo;                                             \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"       \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                     \
do {                                                                \
  unsigned long hi, lo;                                             \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; "     \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"               \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
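//
// (Explanatory addition:) montgomery_multiply computes
//   m = a * b * R^-1 (mod n),  where R = 2^(64*len)
// and inv = -n[0]^-1 (mod 2^64), so in each outer iteration the multiple
// m[i] = t0 * inv of n cancels the low 64 bits of the accumulator exactly.
// A minimal sketch of that cancellation, assuming the __int128 extension
// (illustration only; the real code uses the MACC macros above):
//
//   julong m_i = t0 * inv;  // then t0 + m_i * n[0] == 0 (mod 2^64)
//   unsigned __int128 s = (unsigned __int128)m_i * n[0] + t0;
//   assert((julong)s == 0, "low word must cancel");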

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
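  //
  // (Worked example of the bound below, an explanatory addition:) with
  // divisor = sizeof(julong) * 4 = 32, the guarantee caps longwords at
  // 8192 / 32 = 256, i.e. len <= 512 jints, and the alloca request at
  // 256 longwords * 8 bytes * 4 arrays = 8192 bytes.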
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof(julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof(julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob; compiled code jumps here when an exception
// is thrown (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e., callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
// Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
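//
// (Explanatory addition on the alignment assert in the code below:)
// SimpleRuntimeFrame::framesize counts 4-byte compiler stack slots, so
// requiring framesize % 4 == 0 keeps the frame a multiple of 16 bytes,
// e.g. framesize = 6 slots -> 24 bytes (would fail the assert), while
// framesize = 8 slots -> 32 bytes (16-byte aligned, OK).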
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers. See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()),  rdx);

  // This call does all the hard work. It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-saved
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site. This oopmap will only be used if we
  // are unwinding the stack. Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
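  // (Explanatory addition:) rax is about to be reloaded with the exception
  // oop, so park the handler address in r8 first; r8 is not touched again
  // until the final jmp, so it safely carries the handler across the loads
  // below.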
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2