1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #endif 29 #include "asm/macroAssembler.hpp" 30 #include "asm/macroAssembler.inline.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/icBuffer.hpp" 34 #include "code/nativeInst.hpp" 35 #include "code/vtableStubs.hpp" 36 #include "compiler/oopMap.hpp" 37 #include "gc/shared/collectedHeap.hpp" 38 #include "gc/shared/gcLocker.hpp" 39 #include "gc/shared/barrierSet.hpp" 40 #include "gc/shared/barrierSetAssembler.hpp" 41 #include "interpreter/interpreter.hpp" 42 #include "logging/log.hpp" 43 #include "memory/resourceArea.hpp" 44 #include "memory/universe.hpp" 45 #include "oops/compiledICHolder.hpp" 46 #include "oops/klass.inline.hpp" 47 #include "oops/method.inline.hpp" 48 #include "prims/methodHandles.hpp" 49 #include "runtime/continuation.hpp" 50 #include "runtime/continuationEntry.inline.hpp" 51 #include "runtime/globals.hpp" 52 #include "runtime/jniHandles.hpp" 53 #include "runtime/safepointMechanism.hpp" 54 #include "runtime/sharedRuntime.hpp" 55 #include "runtime/signature.hpp" 56 #include "runtime/stubRoutines.hpp" 57 #include "runtime/vframeArray.hpp" 58 #include "runtime/vm_version.hpp" 59 #include "utilities/align.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 75 76 class SimpleRuntimeFrame { 77 78 public: 79 80 // Most of the runtime stubs have this simple frame layout. 81 // This class exists to make the layout shared in one place. 82 // Offsets are for compiler stack slots, which are jints. 83 enum layout { 84 // The frame sender code expects that rbp will be in the "natural" place and 85 // will override any oopMap setting for it. We must therefore force the layout 86 // so that it agrees with the frame sender code. 87 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 88 rbp_off2, 89 return_off, return_off2, 90 framesize 91 }; 92 }; 93 94 class RegisterSaver { 95 // Capture info about frame layout. Layout offsets are in jint 96 // units because compiler frame slots are jints. 
97 #define XSAVE_AREA_BEGIN 160 98 #define XSAVE_AREA_YMM_BEGIN 576 99 #define XSAVE_AREA_OPMASK_BEGIN 1088 100 #define XSAVE_AREA_ZMM_BEGIN 1152 101 #define XSAVE_AREA_UPPERBANK 1664 102 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 103 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 104 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 105 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 107 enum layout { 108 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 109 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 110 DEF_XMM_OFFS(0), 111 DEF_XMM_OFFS(1), 112 // 2..15 are implied in range usage 113 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 114 DEF_YMM_OFFS(0), 115 DEF_YMM_OFFS(1), 116 // 2..15 are implied in range usage 117 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 118 DEF_OPMASK_OFFS(0), 119 DEF_OPMASK_OFFS(1), 120 // 2..7 are implied in range usage 121 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 122 DEF_ZMM_OFFS(0), 123 DEF_ZMM_OFFS(1), 124 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 125 DEF_ZMM_UPPER_OFFS(16), 126 DEF_ZMM_UPPER_OFFS(17), 127 // 18..31 are implied in range usage 128 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 129 fpu_stateH_end, 130 r15_off, r15H_off, 131 r14_off, r14H_off, 132 r13_off, r13H_off, 133 r12_off, r12H_off, 134 r11_off, r11H_off, 135 r10_off, r10H_off, 136 r9_off, r9H_off, 137 r8_off, r8H_off, 138 rdi_off, rdiH_off, 139 rsi_off, rsiH_off, 140 ignore_off, ignoreH_off, // extra copy of rbp 141 rsp_off, rspH_off, 142 rbx_off, rbxH_off, 143 rdx_off, rdxH_off, 144 rcx_off, rcxH_off, 145 rax_off, raxH_off, 146 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 147 align_off, alignH_off, 148 flags_off, flagsH_off, 149 // The frame sender code expects that rbp will be in the "natural" place and 150 // will override any oopMap setting for it. We must therefore force the layout 151 // so that it agrees with the frame sender code. 
152 rbp_off, rbpH_off, // copy of rbp we will restore 153 return_off, returnH_off, // slot for return address 154 reg_save_size // size in compiler stack slots 155 }; 156 157 public: 158 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors); 159 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false); 160 161 // Offsets into the register save area 162 // Used by deoptimization when it is managing result register 163 // values on its own 164 165 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; } 166 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; } 167 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; } 168 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; } 169 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; } 170 171 // During deoptimization only the result registers need to be restored, 172 // all the other values have already been extracted. 173 static void restore_result_registers(MacroAssembler* masm); 174 }; 175 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) { 177 int off = 0; 178 int num_xmm_regs = XMMRegister::available_xmm_registers(); 179 #if COMPILER2_OR_JVMCI 180 if (save_wide_vectors && UseAVX == 0) { 181 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX 182 } 183 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 184 #else 185 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI 186 #endif 187 188 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated 189 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs); 190 // OopMap frame size is in compiler stack slots (jint's) not bytes or words 191 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 192 // CodeBlob frame size is in words. 193 int frame_size_in_words = frame_size_in_bytes / wordSize; 194 *total_frame_words = frame_size_in_words; 195 196 // Save registers, fpu state, and flags. 197 // We assume caller has already pushed the return address onto the 198 // stack, so rsp is 8-byte aligned here. 199 // We push rpb twice in this sequence because we want the real rbp 200 // to be under the return like a normal enter. 
201 202 __ enter(); // rsp becomes 16-byte aligned here 203 __ push_CPU_state(); // Push a multiple of 16 bytes 204 205 // push cpu state handles this on EVEX enabled targets 206 if (save_wide_vectors) { 207 // Save upper half of YMM registers(0..15) 208 int base_addr = XSAVE_AREA_YMM_BEGIN; 209 for (int n = 0; n < 16; n++) { 210 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 211 } 212 if (VM_Version::supports_evex()) { 213 // Save upper half of ZMM registers(0..15) 214 base_addr = XSAVE_AREA_ZMM_BEGIN; 215 for (int n = 0; n < 16; n++) { 216 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 217 } 218 // Save full ZMM registers(16..num_xmm_regs) 219 base_addr = XSAVE_AREA_UPPERBANK; 220 off = 0; 221 int vector_len = Assembler::AVX_512bit; 222 for (int n = 16; n < num_xmm_regs; n++) { 223 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 224 } 225 #if COMPILER2_OR_JVMCI 226 base_addr = XSAVE_AREA_OPMASK_BEGIN; 227 off = 0; 228 for(int n = 0; n < KRegister::number_of_registers; n++) { 229 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 230 } 231 #endif 232 } 233 } else { 234 if (VM_Version::supports_evex()) { 235 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 236 int base_addr = XSAVE_AREA_UPPERBANK; 237 off = 0; 238 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 239 for (int n = 16; n < num_xmm_regs; n++) { 240 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 241 } 242 #if COMPILER2_OR_JVMCI 243 base_addr = XSAVE_AREA_OPMASK_BEGIN; 244 off = 0; 245 for(int n = 0; n < KRegister::number_of_registers; n++) { 246 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 247 } 248 #endif 249 } 250 } 251 __ vzeroupper(); 252 if (frame::arg_reg_save_area_bytes != 0) { 253 // Allocate argument register save area 254 __ subptr(rsp, frame::arg_reg_save_area_bytes); 255 } 256 257 // Set an oopmap for the call site. This oopmap will map all 258 // oop-registers and debug-info registers as callee-saved. This 259 // will allow deoptimization at this safepoint to find all possible 260 // debug-info recordings, as well as let GC find all oops. 
261 262 OopMapSet *oop_maps = new OopMapSet(); 263 OopMap* map = new OopMap(frame_size_in_slots, 0); 264 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 266 267 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 268 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 269 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 270 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 271 // rbp location is known implicitly by the frame sender code, needs no oopmap 272 // and the location where rbp was saved by is ignored 273 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 274 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 275 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 276 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 277 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 278 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 279 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 280 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 281 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 282 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 283 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 284 // on EVEX enabled targets, we get it included in the xsave area 285 off = xmm0_off; 286 int delta = xmm1_off - off; 287 for (int n = 0; n < 16; n++) { 288 XMMRegister xmm_name = as_XMMRegister(n); 289 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 290 off += delta; 291 } 292 if (UseAVX > 2) { 293 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 294 off = zmm16_off; 295 delta = zmm17_off - off; 296 for (int n = 16; n < num_xmm_regs; n++) { 297 XMMRegister zmm_name = as_XMMRegister(n); 298 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 299 off += delta; 300 } 301 } 302 303 #if COMPILER2_OR_JVMCI 304 if (save_wide_vectors) { 305 // Save upper half of YMM registers(0..15) 306 off = ymm0_off; 307 delta = ymm1_off - ymm0_off; 308 for (int n = 0; n < 16; n++) { 309 XMMRegister ymm_name = as_XMMRegister(n); 310 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 311 off += delta; 312 } 313 if (VM_Version::supports_evex()) { 314 // Save upper half of ZMM registers(0..15) 315 off = zmm0_off; 316 delta = zmm1_off - zmm0_off; 317 for (int n = 0; n < 16; n++) { 318 XMMRegister zmm_name = as_XMMRegister(n); 319 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 320 off += delta; 321 } 322 } 323 } 324 #endif // COMPILER2_OR_JVMCI 325 326 // %%% These should all be a waste but we'll keep things as they were for now 327 if (true) { 328 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 329 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 330 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 331 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 332 // rbp location is known implicitly by the frame sender code, needs no oopmap 333 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 334 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 335 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 336 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 337 map->set_callee_saved(STACK_OFFSET( r10H_off ), 
r10->as_VMReg()->next()); 338 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 339 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 340 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 341 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 342 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 343 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 344 // on EVEX enabled targets, we get it included in the xsave area 345 off = xmm0H_off; 346 delta = xmm1H_off - off; 347 for (int n = 0; n < 16; n++) { 348 XMMRegister xmm_name = as_XMMRegister(n); 349 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 350 off += delta; 351 } 352 if (UseAVX > 2) { 353 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 354 off = zmm16H_off; 355 delta = zmm17H_off - off; 356 for (int n = 16; n < num_xmm_regs; n++) { 357 XMMRegister zmm_name = as_XMMRegister(n); 358 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 359 off += delta; 360 } 361 } 362 } 363 364 return map; 365 } 366 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 368 int num_xmm_regs = XMMRegister::available_xmm_registers(); 369 if (frame::arg_reg_save_area_bytes != 0) { 370 // Pop arg register save area 371 __ addptr(rsp, frame::arg_reg_save_area_bytes); 372 } 373 374 #if COMPILER2_OR_JVMCI 375 if (restore_wide_vectors) { 376 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 377 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 378 } 379 #else 380 assert(!restore_wide_vectors, "vectors are generated only by C2"); 381 #endif 382 383 __ vzeroupper(); 384 385 // On EVEX enabled targets everything is handled in pop fpu state 386 if (restore_wide_vectors) { 387 // Restore upper half of YMM registers (0..15) 388 int base_addr = XSAVE_AREA_YMM_BEGIN; 389 for (int n = 0; n < 16; n++) { 390 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 391 } 392 if (VM_Version::supports_evex()) { 393 // Restore upper half of ZMM registers (0..15) 394 base_addr = XSAVE_AREA_ZMM_BEGIN; 395 for (int n = 0; n < 16; n++) { 396 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 397 } 398 // Restore full ZMM registers(16..num_xmm_regs) 399 base_addr = XSAVE_AREA_UPPERBANK; 400 int vector_len = Assembler::AVX_512bit; 401 int off = 0; 402 for (int n = 16; n < num_xmm_regs; n++) { 403 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 404 } 405 #if COMPILER2_OR_JVMCI 406 base_addr = XSAVE_AREA_OPMASK_BEGIN; 407 off = 0; 408 for (int n = 0; n < KRegister::number_of_registers; n++) { 409 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 410 } 411 #endif 412 } 413 } else { 414 if (VM_Version::supports_evex()) { 415 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 416 int base_addr = XSAVE_AREA_UPPERBANK; 417 int off = 0; 418 int vector_len = VM_Version::supports_avx512vl() ? 
Assembler::AVX_128bit : Assembler::AVX_512bit; 419 for (int n = 16; n < num_xmm_regs; n++) { 420 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 421 } 422 #if COMPILER2_OR_JVMCI 423 base_addr = XSAVE_AREA_OPMASK_BEGIN; 424 off = 0; 425 for (int n = 0; n < KRegister::number_of_registers; n++) { 426 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 427 } 428 #endif 429 } 430 } 431 432 // Recover CPU state 433 __ pop_CPU_state(); 434 // Get the rbp described implicitly by the calling convention (no oopMap) 435 __ pop(rbp); 436 } 437 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 439 440 // Just restore result register. Only used by deoptimization. By 441 // now any callee save register that needs to be restored to a c2 442 // caller of the deoptee has been extracted into the vframeArray 443 // and will be stuffed into the c2i adapter we create for later 444 // restoration so only result registers need to be restored here. 445 446 // Restore fp result register 447 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes())); 448 // Restore integer result register 449 __ movptr(rax, Address(rsp, rax_offset_in_bytes())); 450 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes())); 451 452 // Pop all of the register save are off the stack except the return address 453 __ addptr(rsp, return_offset_in_bytes()); 454 } 455 456 // Is vector's size (in bytes) bigger than a size saved by default? 457 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions. 458 bool SharedRuntime::is_wide_vector(int size) { 459 return size > 16; 460 } 461 462 // --------------------------------------------------------------------------- 463 // Read the array of BasicTypes from a signature, and compute where the 464 // arguments should go. Values in the VMRegPair regs array refer to 4-byte 465 // quantities. Values less than VMRegImpl::stack0 are registers, those above 466 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer 467 // as framesizes are fixed. 468 // VMRegImpl::stack0 refers to the first slot 0(sp). 469 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher. 470 // Register up to Register::number_of_registers are the 64-bit 471 // integer registers. 472 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are 474 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit 475 // units regardless of build. Of course for i486 there is no 64 bit build 476 477 // The Java calling convention is a "shifted" version of the C ABI. 478 // By skipping the first C ABI register we can call non-static jni methods 479 // with small numbers of arguments without having to shuffle the arguments 480 // at all. Since we control the java ABI we ought to at least get some 481 // advantage out of it. 482 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt, 484 VMRegPair *regs, 485 int total_args_passed) { 486 487 // Create the mapping between argument positions and 488 // registers. 
489 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 490 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 491 }; 492 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 493 j_farg0, j_farg1, j_farg2, j_farg3, 494 j_farg4, j_farg5, j_farg6, j_farg7 495 }; 496 497 498 uint int_args = 0; 499 uint fp_args = 0; 500 uint stk_args = 0; 501 502 for (int i = 0; i < total_args_passed; i++) { 503 switch (sig_bt[i]) { 504 case T_BOOLEAN: 505 case T_CHAR: 506 case T_BYTE: 507 case T_SHORT: 508 case T_INT: 509 if (int_args < Argument::n_int_register_parameters_j) { 510 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 511 } else { 512 stk_args = align_up(stk_args, 2); 513 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 514 stk_args += 1; 515 } 516 break; 517 case T_VOID: 518 // halves of T_LONG or T_DOUBLE 519 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 520 regs[i].set_bad(); 521 break; 522 case T_LONG: 523 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 524 // fall through 525 case T_OBJECT: 526 case T_ARRAY: 527 case T_ADDRESS: 528 if (int_args < Argument::n_int_register_parameters_j) { 529 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 530 } else { 531 stk_args = align_up(stk_args, 2); 532 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 533 stk_args += 2; 534 } 535 break; 536 case T_FLOAT: 537 if (fp_args < Argument::n_float_register_parameters_j) { 538 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 539 } else { 540 stk_args = align_up(stk_args, 2); 541 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 542 stk_args += 1; 543 } 544 break; 545 case T_DOUBLE: 546 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 547 if (fp_args < Argument::n_float_register_parameters_j) { 548 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 549 } else { 550 stk_args = align_up(stk_args, 2); 551 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 552 stk_args += 2; 553 } 554 break; 555 default: 556 ShouldNotReachHere(); 557 break; 558 } 559 } 560 561 return stk_args; 562 } 563 564 // Patch the callers callsite with entry to compiled code if it exists. 565 static void patch_callers_callsite(MacroAssembler *masm) { 566 Label L; 567 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 568 __ jcc(Assembler::equal, L); 569 570 // Save the current stack pointer 571 __ mov(r13, rsp); 572 // Schedule the branch target address early. 
573 // Call into the VM to patch the caller, then jump to compiled callee 574 // rax isn't live so capture return address while we easily can 575 __ movptr(rax, Address(rsp, 0)); 576 577 // align stack so push_CPU_state doesn't fault 578 __ andptr(rsp, -(StackAlignmentInBytes)); 579 __ push_CPU_state(); 580 __ vzeroupper(); 581 // VM needs caller's callsite 582 // VM needs target method 583 // This needs to be a long call since we will relocate this adapter to 584 // the codeBuffer and it may not reach 585 586 // Allocate argument register save area 587 if (frame::arg_reg_save_area_bytes != 0) { 588 __ subptr(rsp, frame::arg_reg_save_area_bytes); 589 } 590 __ mov(c_rarg0, rbx); 591 __ mov(c_rarg1, rax); 592 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 593 594 // De-allocate argument register save area 595 if (frame::arg_reg_save_area_bytes != 0) { 596 __ addptr(rsp, frame::arg_reg_save_area_bytes); 597 } 598 599 __ vzeroupper(); 600 __ pop_CPU_state(); 601 // restore sp 602 __ mov(rsp, r13); 603 __ bind(L); 604 } 605 606 607 static void gen_c2i_adapter(MacroAssembler *masm, 608 int total_args_passed, 609 int comp_args_on_stack, 610 const BasicType *sig_bt, 611 const VMRegPair *regs, 612 Label& skip_fixup) { 613 // Before we get into the guts of the C2I adapter, see if we should be here 614 // at all. We've come from compiled code and are attempting to jump to the 615 // interpreter, which means the caller made a static call to get here 616 // (vcalls always get a compiled target if there is one). Check for a 617 // compiled target. If there is one, we need to patch the caller's call. 618 patch_callers_callsite(masm); 619 620 __ bind(skip_fixup); 621 622 // Since all args are passed on the stack, total_args_passed * 623 // Interpreter::stackElementSize is the space we need. 624 625 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 626 627 int extraspace = (total_args_passed * Interpreter::stackElementSize); 628 629 // stack is aligned, keep it that way 630 // This is not currently needed or enforced by the interpreter, but 631 // we might as well conform to the ABI. 632 extraspace = align_up(extraspace, 2*wordSize); 633 634 // set senderSP value 635 __ lea(r13, Address(rsp, wordSize)); 636 637 #ifdef ASSERT 638 __ check_stack_alignment(r13, "sender stack not aligned"); 639 #endif 640 if (extraspace > 0) { 641 // Pop the return address 642 __ pop(rax); 643 644 __ subptr(rsp, extraspace); 645 646 // Push the return address 647 __ push(rax); 648 649 // Account for the return address location since we store it first rather 650 // than hold it in a register across all the shuffling 651 extraspace += wordSize; 652 } 653 654 #ifdef ASSERT 655 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 656 #endif 657 658 // Now write the args into the outgoing interpreter space 659 for (int i = 0; i < total_args_passed; i++) { 660 if (sig_bt[i] == T_VOID) { 661 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 662 continue; 663 } 664 665 // offset to start parameters 666 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 667 int next_off = st_off - Interpreter::stackElementSize; 668 669 // Say 4 args: 670 // i st_off 671 // 0 32 T_LONG 672 // 1 24 T_VOID 673 // 2 16 T_OBJECT 674 // 3 8 T_BOOL 675 // - 0 return address 676 // 677 // However to make thing extra confusing. 
Because we can fit a long/double in 678 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 679 // leaves one slot empty and only stores to a single slot. In this case the 680 // slot that is occupied is the T_VOID slot. See I said it was confusing. 681 682 VMReg r_1 = regs[i].first(); 683 VMReg r_2 = regs[i].second(); 684 if (!r_1->is_valid()) { 685 assert(!r_2->is_valid(), ""); 686 continue; 687 } 688 if (r_1->is_stack()) { 689 // memory to memory use rax 690 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 691 if (!r_2->is_valid()) { 692 // sign extend?? 693 __ movl(rax, Address(rsp, ld_off)); 694 __ movptr(Address(rsp, st_off), rax); 695 696 } else { 697 698 __ movq(rax, Address(rsp, ld_off)); 699 700 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 701 // T_DOUBLE and T_LONG use two slots in the interpreter 702 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 703 // ld_off == LSW, ld_off+wordSize == MSW 704 // st_off == MSW, next_off == LSW 705 __ movq(Address(rsp, next_off), rax); 706 #ifdef ASSERT 707 // Overwrite the unused slot with known junk 708 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 709 __ movptr(Address(rsp, st_off), rax); 710 #endif /* ASSERT */ 711 } else { 712 __ movq(Address(rsp, st_off), rax); 713 } 714 } 715 } else if (r_1->is_Register()) { 716 Register r = r_1->as_Register(); 717 if (!r_2->is_valid()) { 718 // must be only an int (or less ) so move only 32bits to slot 719 // why not sign extend?? 720 __ movl(Address(rsp, st_off), r); 721 } else { 722 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 723 // T_DOUBLE and T_LONG use two slots in the interpreter 724 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 725 // long/double in gpr 726 #ifdef ASSERT 727 // Overwrite the unused slot with known junk 728 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 729 __ movptr(Address(rsp, st_off), rax); 730 #endif /* ASSERT */ 731 __ movq(Address(rsp, next_off), r); 732 } else { 733 __ movptr(Address(rsp, st_off), r); 734 } 735 } 736 } else { 737 assert(r_1->is_XMMRegister(), ""); 738 if (!r_2->is_valid()) { 739 // only a float use just part of the slot 740 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 741 } else { 742 #ifdef ASSERT 743 // Overwrite the unused slot with known junk 744 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 745 __ movptr(Address(rsp, st_off), rax); 746 #endif /* ASSERT */ 747 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 748 } 749 } 750 } 751 752 // Schedule the branch target address early. 753 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 754 __ jmp(rcx); 755 } 756 757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, 758 address code_start, address code_end, 759 Label& L_ok) { 760 Label L_fail; 761 __ lea(temp_reg, ExternalAddress(code_start)); 762 __ cmpptr(pc_reg, temp_reg); 763 __ jcc(Assembler::belowEqual, L_fail); 764 __ lea(temp_reg, ExternalAddress(code_end)); 765 __ cmpptr(pc_reg, temp_reg); 766 __ jcc(Assembler::below, L_ok); 767 __ bind(L_fail); 768 } 769 770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 771 int total_args_passed, 772 int comp_args_on_stack, 773 const BasicType *sig_bt, 774 const VMRegPair *regs) { 775 776 // Note: r13 contains the senderSP on entry. We must preserve it since 777 // we may do a i2c -> c2i transition if we lose a race where compiled 778 // code goes non-entrant while we get args ready. 
779 // In addition we use r13 to locate all the interpreter args as 780 // we must align the stack to 16 bytes on an i2c entry else we 781 // lose alignment we expect in all compiled code and register 782 // save code can segv when fxsave instructions find improperly 783 // aligned stack pointer. 784 785 // Adapters can be frameless because they do not require the caller 786 // to perform additional cleanup work, such as correcting the stack pointer. 787 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 788 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 789 // even if a callee has modified the stack pointer. 790 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 791 // routinely repairs its caller's stack pointer (from sender_sp, which is set 792 // up via the senderSP register). 793 // In other words, if *either* the caller or callee is interpreted, we can 794 // get the stack pointer repaired after a call. 795 // This is why c2i and i2c adapters cannot be indefinitely composed. 796 // In particular, if a c2i adapter were to somehow call an i2c adapter, 797 // both caller and callee would be compiled methods, and neither would 798 // clean up the stack pointer changes performed by the two adapters. 799 // If this happens, control eventually transfers back to the compiled 800 // caller, but with an uncorrected stack, causing delayed havoc. 801 802 if (VerifyAdapterCalls && 803 (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) { 804 // So, let's test for cascading c2i/i2c adapters right now. 805 // assert(Interpreter::contains($return_addr) || 806 // StubRoutines::contains($return_addr), 807 // "i2c adapter must return to an interpreter frame"); 808 __ block_comment("verify_i2c { "); 809 // Pick up the return address 810 __ movptr(rax, Address(rsp, 0)); 811 Label L_ok; 812 if (Interpreter::code() != nullptr) { 813 range_check(masm, rax, r11, 814 Interpreter::code()->code_start(), 815 Interpreter::code()->code_end(), 816 L_ok); 817 } 818 if (StubRoutines::initial_stubs_code() != nullptr) { 819 range_check(masm, rax, r11, 820 StubRoutines::initial_stubs_code()->code_begin(), 821 StubRoutines::initial_stubs_code()->code_end(), 822 L_ok); 823 } 824 if (StubRoutines::final_stubs_code() != nullptr) { 825 range_check(masm, rax, r11, 826 StubRoutines::final_stubs_code()->code_begin(), 827 StubRoutines::final_stubs_code()->code_end(), 828 L_ok); 829 } 830 const char* msg = "i2c adapter must return to an interpreter frame"; 831 __ block_comment(msg); 832 __ stop(msg); 833 __ bind(L_ok); 834 __ block_comment("} verify_i2ce "); 835 } 836 837 // Must preserve original SP for loading incoming arguments because 838 // we need to align the outgoing SP for compiled code. 839 __ movptr(r11, rsp); 840 841 // Pick up the return address 842 __ pop(rax); 843 844 // Convert 4-byte c2 stack slots to words. 
845 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 846 847 if (comp_args_on_stack) { 848 __ subptr(rsp, comp_words_on_stack * wordSize); 849 } 850 851 // Ensure compiled code always sees stack at proper alignment 852 __ andptr(rsp, -16); 853 854 // push the return address and misalign the stack that youngest frame always sees 855 // as far as the placement of the call instruction 856 __ push(rax); 857 858 // Put saved SP in another register 859 const Register saved_sp = rax; 860 __ movptr(saved_sp, r11); 861 862 // Will jump to the compiled code just as if compiled code was doing it. 863 // Pre-load the register-jump target early, to schedule it better. 864 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 865 866 #if INCLUDE_JVMCI 867 if (EnableJVMCI) { 868 // check if this call should be routed towards a specific entry point 869 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 870 Label no_alternative_target; 871 __ jcc(Assembler::equal, no_alternative_target); 872 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 873 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 874 __ bind(no_alternative_target); 875 } 876 #endif // INCLUDE_JVMCI 877 878 // Now generate the shuffle code. Pick up all register args and move the 879 // rest through the floating point stack top. 880 for (int i = 0; i < total_args_passed; i++) { 881 if (sig_bt[i] == T_VOID) { 882 // Longs and doubles are passed in native word order, but misaligned 883 // in the 32-bit build. 884 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 885 continue; 886 } 887 888 // Pick up 0, 1 or 2 words from SP+offset. 889 890 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 891 "scrambled load targets?"); 892 // Load in argument order going down. 893 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 894 // Point to interpreter value (vs. tag) 895 int next_off = ld_off - Interpreter::stackElementSize; 896 // 897 // 898 // 899 VMReg r_1 = regs[i].first(); 900 VMReg r_2 = regs[i].second(); 901 if (!r_1->is_valid()) { 902 assert(!r_2->is_valid(), ""); 903 continue; 904 } 905 if (r_1->is_stack()) { 906 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 907 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 908 909 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 910 // and if we end up going thru a c2i because of a miss a reasonable value of r13 911 // will be generated. 912 if (!r_2->is_valid()) { 913 // sign extend??? 914 __ movl(r13, Address(saved_sp, ld_off)); 915 __ movptr(Address(rsp, st_off), r13); 916 } else { 917 // 918 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 919 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 920 // So we must adjust where to pick up the data to match the interpreter. 921 // 922 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 923 // are accessed as negative so LSW is at LOW address 924 925 // ld_off is MSW so get LSW 926 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 927 next_off : ld_off; 928 __ movq(r13, Address(saved_sp, offset)); 929 // st_off is LSW (i.e. 
reg.first()) 930 __ movq(Address(rsp, st_off), r13); 931 } 932 } else if (r_1->is_Register()) { // Register argument 933 Register r = r_1->as_Register(); 934 assert(r != rax, "must be different"); 935 if (r_2->is_valid()) { 936 // 937 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 938 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 939 // So we must adjust where to pick up the data to match the interpreter. 940 941 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 942 next_off : ld_off; 943 944 // this can be a misaligned move 945 __ movq(r, Address(saved_sp, offset)); 946 } else { 947 // sign extend and use a full word? 948 __ movl(r, Address(saved_sp, ld_off)); 949 } 950 } else { 951 if (!r_2->is_valid()) { 952 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 953 } else { 954 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 955 } 956 } 957 } 958 959 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 960 961 // 6243940 We might end up in handle_wrong_method if 962 // the callee is deoptimized as we race thru here. If that 963 // happens we don't want to take a safepoint because the 964 // caller frame will look interpreted and arguments are now 965 // "compiled" so it is much better to make this transition 966 // invisible to the stack walking code. Unfortunately if 967 // we try and find the callee by normal means a safepoint 968 // is possible. So we stash the desired callee in the thread 969 // and the vm will find there should this case occur. 970 971 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 972 973 // put Method* where a c2i would expect should we end up there 974 // only needed because eof c2 resolve stubs return Method* as a result in 975 // rax 976 __ mov(rax, rbx); 977 __ jmp(r11); 978 } 979 980 // --------------------------------------------------------------- 981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 982 int total_args_passed, 983 int comp_args_on_stack, 984 const BasicType *sig_bt, 985 const VMRegPair *regs, 986 AdapterFingerPrint* fingerprint) { 987 address i2c_entry = __ pc(); 988 989 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 990 991 // ------------------------------------------------------------------------- 992 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 993 // to the interpreter. The args start out packed in the compiled layout. They 994 // need to be unpacked into the interpreter layout. This will almost always 995 // require some stack space. We grow the current (compiled) stack, then repack 996 // the args. We finally end in a jump to the generic interpreter entry point. 997 // On exit from the interpreter, the interpreter will restore our SP (lest the 998 // compiled code, which relies solely on SP and not RBP, get sick). 
999 1000 address c2i_unverified_entry = __ pc(); 1001 Label skip_fixup; 1002 Label ok; 1003 1004 Register holder = rax; 1005 Register receiver = j_rarg0; 1006 Register temp = rbx; 1007 1008 { 1009 __ load_klass(temp, receiver, rscratch1); 1010 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset())); 1011 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset())); 1012 __ jcc(Assembler::equal, ok); 1013 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1014 1015 __ bind(ok); 1016 // Method might have been compiled since the call site was patched to 1017 // interpreted if that is the case treat it as a miss so we can get 1018 // the call site corrected. 1019 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1020 __ jcc(Assembler::equal, skip_fixup); 1021 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1022 } 1023 1024 address c2i_entry = __ pc(); 1025 1026 // Class initialization barrier for static methods 1027 address c2i_no_clinit_check_entry = nullptr; 1028 if (VM_Version::supports_fast_class_init_checks()) { 1029 Label L_skip_barrier; 1030 Register method = rbx; 1031 1032 { // Bypass the barrier for non-static methods 1033 Register flags = rscratch1; 1034 __ movl(flags, Address(method, Method::access_flags_offset())); 1035 __ testl(flags, JVM_ACC_STATIC); 1036 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1037 } 1038 1039 Register klass = rscratch1; 1040 __ load_method_holder(klass, method); 1041 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1042 1043 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1044 1045 __ bind(L_skip_barrier); 1046 c2i_no_clinit_check_entry = __ pc(); 1047 } 1048 1049 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1050 bs->c2i_entry_barrier(masm); 1051 1052 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1053 1054 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1055 } 1056 1057 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1058 VMRegPair *regs, 1059 VMRegPair *regs2, 1060 int total_args_passed) { 1061 assert(regs2 == nullptr, "not needed on x86"); 1062 // We return the amount of VMRegImpl stack slots we need to reserve for all 1063 // the arguments NOT counting out_preserve_stack_slots. 
1064 1065 // NOTE: These arrays will have to change when c1 is ported 1066 #ifdef _WIN64 1067 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1068 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1069 }; 1070 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1071 c_farg0, c_farg1, c_farg2, c_farg3 1072 }; 1073 #else 1074 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1075 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1076 }; 1077 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1078 c_farg0, c_farg1, c_farg2, c_farg3, 1079 c_farg4, c_farg5, c_farg6, c_farg7 1080 }; 1081 #endif // _WIN64 1082 1083 1084 uint int_args = 0; 1085 uint fp_args = 0; 1086 uint stk_args = 0; // inc by 2 each time 1087 1088 for (int i = 0; i < total_args_passed; i++) { 1089 switch (sig_bt[i]) { 1090 case T_BOOLEAN: 1091 case T_CHAR: 1092 case T_BYTE: 1093 case T_SHORT: 1094 case T_INT: 1095 if (int_args < Argument::n_int_register_parameters_c) { 1096 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1097 #ifdef _WIN64 1098 fp_args++; 1099 // Allocate slots for callee to stuff register args the stack. 1100 stk_args += 2; 1101 #endif 1102 } else { 1103 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1104 stk_args += 2; 1105 } 1106 break; 1107 case T_LONG: 1108 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1109 // fall through 1110 case T_OBJECT: 1111 case T_ARRAY: 1112 case T_ADDRESS: 1113 case T_METADATA: 1114 if (int_args < Argument::n_int_register_parameters_c) { 1115 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1116 #ifdef _WIN64 1117 fp_args++; 1118 stk_args += 2; 1119 #endif 1120 } else { 1121 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1122 stk_args += 2; 1123 } 1124 break; 1125 case T_FLOAT: 1126 if (fp_args < Argument::n_float_register_parameters_c) { 1127 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1128 #ifdef _WIN64 1129 int_args++; 1130 // Allocate slots for callee to stuff register args the stack. 1131 stk_args += 2; 1132 #endif 1133 } else { 1134 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1135 stk_args += 2; 1136 } 1137 break; 1138 case T_DOUBLE: 1139 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1140 if (fp_args < Argument::n_float_register_parameters_c) { 1141 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1142 #ifdef _WIN64 1143 int_args++; 1144 // Allocate slots for callee to stuff register args the stack. 1145 stk_args += 2; 1146 #endif 1147 } else { 1148 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1149 stk_args += 2; 1150 } 1151 break; 1152 case T_VOID: // Halves of longs and doubles 1153 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1154 regs[i].set_bad(); 1155 break; 1156 default: 1157 ShouldNotReachHere(); 1158 break; 1159 } 1160 } 1161 #ifdef _WIN64 1162 // windows abi requires that we always allocate enough stack space 1163 // for 4 64bit registers to be stored down. 
1164 if (stk_args < 8) { 1165 stk_args = 8; 1166 } 1167 #endif // _WIN64 1168 1169 return stk_args; 1170 } 1171 1172 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1173 uint num_bits, 1174 uint total_args_passed) { 1175 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1176 "only certain vector sizes are supported for now"); 1177 1178 static const XMMRegister VEC_ArgReg[32] = { 1179 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1180 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1181 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1182 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1183 }; 1184 1185 uint stk_args = 0; 1186 uint fp_args = 0; 1187 1188 for (uint i = 0; i < total_args_passed; i++) { 1189 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1190 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1191 regs[i].set_pair(vmreg->next(next_val), vmreg); 1192 } 1193 1194 return stk_args; 1195 } 1196 1197 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1198 // We always ignore the frame_slots arg and just use the space just below frame pointer 1199 // which by this time is free to use 1200 switch (ret_type) { 1201 case T_FLOAT: 1202 __ movflt(Address(rbp, -wordSize), xmm0); 1203 break; 1204 case T_DOUBLE: 1205 __ movdbl(Address(rbp, -wordSize), xmm0); 1206 break; 1207 case T_VOID: break; 1208 default: { 1209 __ movptr(Address(rbp, -wordSize), rax); 1210 } 1211 } 1212 } 1213 1214 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1215 // We always ignore the frame_slots arg and just use the space just below frame pointer 1216 // which by this time is free to use 1217 switch (ret_type) { 1218 case T_FLOAT: 1219 __ movflt(xmm0, Address(rbp, -wordSize)); 1220 break; 1221 case T_DOUBLE: 1222 __ movdbl(xmm0, Address(rbp, -wordSize)); 1223 break; 1224 case T_VOID: break; 1225 default: { 1226 __ movptr(rax, Address(rbp, -wordSize)); 1227 } 1228 } 1229 } 1230 1231 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1232 for ( int i = first_arg ; i < arg_count ; i++ ) { 1233 if (args[i].first()->is_Register()) { 1234 __ push(args[i].first()->as_Register()); 1235 } else if (args[i].first()->is_XMMRegister()) { 1236 __ subptr(rsp, 2*wordSize); 1237 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1238 } 1239 } 1240 } 1241 1242 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1243 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1244 if (args[i].first()->is_Register()) { 1245 __ pop(args[i].first()->as_Register()); 1246 } else if (args[i].first()->is_XMMRegister()) { 1247 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1248 __ addptr(rsp, 2*wordSize); 1249 } 1250 } 1251 } 1252 1253 static void verify_oop_args(MacroAssembler* masm, 1254 const methodHandle& method, 1255 const BasicType* sig_bt, 1256 const VMRegPair* regs) { 1257 Register temp_reg = rbx; // not part of any compiled calling seq 1258 if (VerifyOops) { 1259 for (int i = 0; i < method->size_of_parameters(); i++) { 1260 if (is_reference_type(sig_bt[i])) { 1261 VMReg r = regs[i].first(); 1262 assert(r->is_valid(), "bad oop arg"); 1263 if (r->is_stack()) { 1264 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1265 __ verify_oop(temp_reg); 1266 } else { 1267 __ 
verify_oop(r->as_Register()); 1268 } 1269 } 1270 } 1271 } 1272 } 1273 1274 static void check_continuation_enter_argument(VMReg actual_vmreg, 1275 Register expected_reg, 1276 const char* name) { 1277 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1278 assert(actual_vmreg->as_Register() == expected_reg, 1279 "%s is in unexpected register: %s instead of %s", 1280 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1281 } 1282 1283 1284 //---------------------------- continuation_enter_setup --------------------------- 1285 // 1286 // Arguments: 1287 // None. 1288 // 1289 // Results: 1290 // rsp: pointer to blank ContinuationEntry 1291 // 1292 // Kills: 1293 // rax 1294 // 1295 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1296 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1297 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1298 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1299 1300 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1301 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1302 1303 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1304 OopMap* map = new OopMap(frame_size, 0); 1305 1306 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1307 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1308 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1309 1310 return map; 1311 } 1312 1313 //---------------------------- fill_continuation_entry --------------------------- 1314 // 1315 // Arguments: 1316 // rsp: pointer to blank Continuation entry 1317 // reg_cont_obj: pointer to the continuation 1318 // reg_flags: flags 1319 // 1320 // Results: 1321 // rsp: pointer to filled out ContinuationEntry 1322 // 1323 // Kills: 1324 // rax 1325 // 1326 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1327 assert_different_registers(rax, reg_cont_obj, reg_flags); 1328 #ifdef ASSERT 1329 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1330 #endif 1331 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1332 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1333 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1334 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1335 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1336 1337 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1338 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1339 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1340 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1341 1342 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1343 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1344 } 1345 1346 //---------------------------- continuation_enter_cleanup --------------------------- 1347 // 1348 // Arguments: 1349 // rsp: pointer to the ContinuationEntry 1350 // 1351 // Results: 1352 // rsp: pointer to the spilled rbp in the entry frame 1353 // 1354 // Kills: 1355 // rbx 1356 // 1357 void static continuation_enter_cleanup(MacroAssembler* masm) { 1358 #ifdef ASSERT 1359 
Label L_good_sp; 1360 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1361 __ jcc(Assembler::equal, L_good_sp); 1362 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1363 __ bind(L_good_sp); 1364 #endif 1365 1366 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1367 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1368 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1369 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1370 1371 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1372 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1373 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1374 } 1375 1376 static void gen_continuation_enter(MacroAssembler* masm, 1377 const VMRegPair* regs, 1378 int& exception_offset, 1379 OopMapSet* oop_maps, 1380 int& frame_complete, 1381 int& stack_slots, 1382 int& interpreted_entry_offset, 1383 int& compiled_entry_offset) { 1384 1385 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1386 int pos_cont_obj = 0; 1387 int pos_is_cont = 1; 1388 int pos_is_virtual = 2; 1389 1390 // The platform-specific calling convention may present the arguments in various registers. 1391 // To simplify the rest of the code, we expect the arguments to reside at these known 1392 // registers, and we additionally check the placement here in case calling convention ever 1393 // changes. 1394 Register reg_cont_obj = c_rarg1; 1395 Register reg_is_cont = c_rarg2; 1396 Register reg_is_virtual = c_rarg3; 1397 1398 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1399 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1400 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1401 1402 // Utility methods kill rax, make sure there are no collisions 1403 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1404 1405 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1406 relocInfo::static_call_type); 1407 1408 address start = __ pc(); 1409 1410 Label L_thaw, L_exit; 1411 1412 // i2i entry used at interp_only_mode only 1413 interpreted_entry_offset = __ pc() - start; 1414 { 1415 #ifdef ASSERT 1416 Label is_interp_only; 1417 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1418 __ jcc(Assembler::notEqual, is_interp_only); 1419 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1420 __ bind(is_interp_only); 1421 #endif 1422 1423 __ pop(rax); // return address 1424 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1425 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1426 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1427 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1428 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1429 __ push(rax); // return address 1430 __ push_cont_fastpath(); 1431 1432 __ enter(); 1433 1434 stack_slots = 2; // will be adjusted in setup 1435 OopMap* map = continuation_enter_setup(masm, stack_slots); 1436 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1437 // but that's okay because at the very worst we'll miss an async sample, but we're in 
interp_only_mode anyway. 1438 1439 __ verify_oop(reg_cont_obj); 1440 1441 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1442 1443 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1444 __ testptr(reg_is_cont, reg_is_cont); 1445 __ jcc(Assembler::notZero, L_thaw); 1446 1447 // --- Resolve path 1448 1449 // Make sure the call is patchable 1450 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1451 // Emit stub for static call 1452 CodeBuffer* cbuf = masm->code_section()->outer(); 1453 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc()); 1454 if (stub == nullptr) { 1455 fatal("CodeCache is full at gen_continuation_enter"); 1456 } 1457 __ call(resolve); 1458 oop_maps->add_gc_map(__ pc() - start, map); 1459 __ post_call_nop(); 1460 1461 __ jmp(L_exit); 1462 } 1463 1464 // compiled entry 1465 __ align(CodeEntryAlignment); 1466 compiled_entry_offset = __ pc() - start; 1467 __ enter(); 1468 1469 stack_slots = 2; // will be adjusted in setup 1470 OopMap* map = continuation_enter_setup(masm, stack_slots); 1471 1472 // Frame is now completed as far as size and linkage. 1473 frame_complete = __ pc() - start; 1474 1475 __ verify_oop(reg_cont_obj); 1476 1477 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1478 1479 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1480 __ testptr(reg_is_cont, reg_is_cont); 1481 __ jccb(Assembler::notZero, L_thaw); 1482 1483 // --- call Continuation.enter(Continuation c, boolean isContinue) 1484 1485 // Make sure the call is patchable 1486 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1487 1488 // Emit stub for static call 1489 CodeBuffer* cbuf = masm->code_section()->outer(); 1490 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc()); 1491 if (stub == nullptr) { 1492 fatal("CodeCache is full at gen_continuation_enter"); 1493 } 1494 1495 // The call needs to be resolved. There's a special case for this in 1496 // SharedRuntime::find_callee_info_helper() which calls 1497 // LinkResolver::resolve_continuation_enter() which resolves the call to 1498 // Continuation.enter(Continuation c, boolean isContinue). 
1499 __ call(resolve); 1500 1501 oop_maps->add_gc_map(__ pc() - start, map); 1502 __ post_call_nop(); 1503 1504 __ jmpb(L_exit); 1505 1506 // --- Thawing path 1507 1508 __ bind(L_thaw); 1509 1510 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1511 1512 ContinuationEntry::_return_pc_offset = __ pc() - start; 1513 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1514 __ post_call_nop(); 1515 1516 // --- Normal exit (resolve/thawing) 1517 1518 __ bind(L_exit); 1519 1520 continuation_enter_cleanup(masm); 1521 __ pop(rbp); 1522 __ ret(0); 1523 1524 // --- Exception handling path 1525 1526 exception_offset = __ pc() - start; 1527 1528 continuation_enter_cleanup(masm); 1529 __ pop(rbp); 1530 1531 __ movptr(c_rarg0, r15_thread); 1532 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1533 1534 // rax still holds the original exception oop, save it before the call 1535 __ push(rax); 1536 1537 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1538 __ movptr(rbx, rax); 1539 1540 // Continue at exception handler: 1541 // rax: exception oop 1542 // rbx: exception handler 1543 // rdx: exception pc 1544 __ pop(rax); 1545 __ verify_oop(rax); 1546 __ pop(rdx); 1547 __ jmp(rbx); 1548 } 1549 1550 static void gen_continuation_yield(MacroAssembler* masm, 1551 const VMRegPair* regs, 1552 OopMapSet* oop_maps, 1553 int& frame_complete, 1554 int& stack_slots, 1555 int& compiled_entry_offset) { 1556 enum layout { 1557 rbp_off, 1558 rbpH_off, 1559 return_off, 1560 return_off2, 1561 framesize // inclusive of return address 1562 }; 1563 stack_slots = framesize / VMRegImpl::slots_per_word; 1564 assert(stack_slots == 2, "recheck layout"); 1565 1566 address start = __ pc(); 1567 compiled_entry_offset = __ pc() - start; 1568 __ enter(); 1569 address the_pc = __ pc(); 1570 1571 frame_complete = the_pc - start; 1572 1573 // This nop must be exactly at the PC we push into the frame info. 1574 // We use this nop for fast CodeBlob lookup, associate the OopMap 1575 // with it right away. 
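  // (Illustrative note: frame_complete == the_pc - start, so the OopMap added
  //  below is keyed to the address of this nop, which is also the PC recorded
  //  with set_last_Java_frame.)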
1576 __ post_call_nop(); 1577 OopMap* map = new OopMap(framesize, 1); 1578 oop_maps->add_gc_map(frame_complete, map); 1579 1580 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1581 __ movptr(c_rarg0, r15_thread); 1582 __ movptr(c_rarg1, rsp); 1583 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1584 __ reset_last_Java_frame(true); 1585 1586 Label L_pinned; 1587 1588 __ testptr(rax, rax); 1589 __ jcc(Assembler::notZero, L_pinned); 1590 1591 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1592 continuation_enter_cleanup(masm); 1593 __ pop(rbp); 1594 __ ret(0); 1595 1596 __ bind(L_pinned); 1597 1598 // Pinned, return to caller 1599 1600 // handle pending exception thrown by freeze 1601 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1602 Label ok; 1603 __ jcc(Assembler::equal, ok); 1604 __ leave(); 1605 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1606 __ bind(ok); 1607 1608 __ leave(); 1609 __ ret(0); 1610 } 1611 1612 static void gen_special_dispatch(MacroAssembler* masm, 1613 const methodHandle& method, 1614 const BasicType* sig_bt, 1615 const VMRegPair* regs) { 1616 verify_oop_args(masm, method, sig_bt, regs); 1617 vmIntrinsics::ID iid = method->intrinsic_id(); 1618 1619 // Now write the args into the outgoing interpreter space 1620 bool has_receiver = false; 1621 Register receiver_reg = noreg; 1622 int member_arg_pos = -1; 1623 Register member_reg = noreg; 1624 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1625 if (ref_kind != 0) { 1626 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1627 member_reg = rbx; // known to be free at this point 1628 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1629 } else if (iid == vmIntrinsics::_invokeBasic) { 1630 has_receiver = true; 1631 } else if (iid == vmIntrinsics::_linkToNative) { 1632 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1633 member_reg = rbx; // known to be free at this point 1634 } else { 1635 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1636 } 1637 1638 if (member_reg != noreg) { 1639 // Load the member_arg into register, if necessary. 1640 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1641 VMReg r = regs[member_arg_pos].first(); 1642 if (r->is_stack()) { 1643 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1644 } else { 1645 // no data motion is needed 1646 member_reg = r->as_Register(); 1647 } 1648 } 1649 1650 if (has_receiver) { 1651 // Make sure the receiver is loaded into a register. 1652 assert(method->size_of_parameters() > 0, "oob"); 1653 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1654 VMReg r = regs[0].first(); 1655 assert(r->is_valid(), "bad receiver arg"); 1656 if (r->is_stack()) { 1657 // Porting note: This assumes that compiled calling conventions always 1658 // pass the receiver oop in a register. If this is not true on some 1659 // platform, pick a temp and load the receiver from stack. 
1660 fatal("receiver always in a register"); 1661 receiver_reg = j_rarg0; // known to be free at this point 1662 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1663 } else { 1664 // no data motion is needed 1665 receiver_reg = r->as_Register(); 1666 } 1667 } 1668 1669 // Figure out which address we are really jumping to: 1670 MethodHandles::generate_method_handle_dispatch(masm, iid, 1671 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1672 } 1673 1674 // --------------------------------------------------------------------------- 1675 // Generate a native wrapper for a given method. The method takes arguments 1676 // in the Java compiled code convention, marshals them to the native 1677 // convention (handlizes oops, etc), transitions to native, makes the call, 1678 // returns to java state (possibly blocking), unhandlizes any result and 1679 // returns. 1680 // 1681 // Critical native functions are a shorthand for the use of 1682 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1683 // functions. The wrapper is expected to unpack the arguments before 1684 // passing them to the callee. Critical native functions leave the state _in_Java, 1685 // since they cannot stop for GC. 1686 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1687 // block and the check for pending exceptions it's impossible for them 1688 // to be thrown. 1689 // 1690 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1691 const methodHandle& method, 1692 int compile_id, 1693 BasicType* in_sig_bt, 1694 VMRegPair* in_regs, 1695 BasicType ret_type) { 1696 if (method->is_continuation_native_intrinsic()) { 1697 int exception_offset = -1; 1698 OopMapSet* oop_maps = new OopMapSet(); 1699 int frame_complete = -1; 1700 int stack_slots = -1; 1701 int interpreted_entry_offset = -1; 1702 int vep_offset = -1; 1703 if (method->is_continuation_enter_intrinsic()) { 1704 gen_continuation_enter(masm, 1705 in_regs, 1706 exception_offset, 1707 oop_maps, 1708 frame_complete, 1709 stack_slots, 1710 interpreted_entry_offset, 1711 vep_offset); 1712 } else if (method->is_continuation_yield_intrinsic()) { 1713 gen_continuation_yield(masm, 1714 in_regs, 1715 oop_maps, 1716 frame_complete, 1717 stack_slots, 1718 vep_offset); 1719 } else { 1720 guarantee(false, "Unknown Continuation native intrinsic"); 1721 } 1722 1723 #ifdef ASSERT 1724 if (method->is_continuation_enter_intrinsic()) { 1725 assert(interpreted_entry_offset != -1, "Must be set"); 1726 assert(exception_offset != -1, "Must be set"); 1727 } else { 1728 assert(interpreted_entry_offset == -1, "Must be unset"); 1729 assert(exception_offset == -1, "Must be unset"); 1730 } 1731 assert(frame_complete != -1, "Must be set"); 1732 assert(stack_slots != -1, "Must be set"); 1733 assert(vep_offset != -1, "Must be set"); 1734 #endif 1735 1736 __ flush(); 1737 nmethod* nm = nmethod::new_native_nmethod(method, 1738 compile_id, 1739 masm->code(), 1740 vep_offset, 1741 frame_complete, 1742 stack_slots, 1743 in_ByteSize(-1), 1744 in_ByteSize(-1), 1745 oop_maps, 1746 exception_offset); 1747 if (method->is_continuation_enter_intrinsic()) { 1748 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1749 } else if (method->is_continuation_yield_intrinsic()) { 1750 _cont_doYield_stub = nm; 1751 } 1752 return nm; 1753 } 1754 1755 if (method->is_method_handle_intrinsic()) { 1756 vmIntrinsics::ID iid = method->intrinsic_id(); 1757 intptr_t start = (intptr_t)__ pc(); 1758 int 
vep_offset = ((intptr_t)__ pc()) - start; 1759 gen_special_dispatch(masm, 1760 method, 1761 in_sig_bt, 1762 in_regs); 1763 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1764 __ flush(); 1765 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1766 return nmethod::new_native_nmethod(method, 1767 compile_id, 1768 masm->code(), 1769 vep_offset, 1770 frame_complete, 1771 stack_slots / VMRegImpl::slots_per_word, 1772 in_ByteSize(-1), 1773 in_ByteSize(-1), 1774 nullptr); 1775 } 1776 address native_func = method->native_function(); 1777 assert(native_func != nullptr, "must have function"); 1778 1779 // An OopMap for lock (and class if static) 1780 OopMapSet *oop_maps = new OopMapSet(); 1781 intptr_t start = (intptr_t)__ pc(); 1782 1783 // We have received a description of where all the java arg are located 1784 // on entry to the wrapper. We need to convert these args to where 1785 // the jni function will expect them. To figure out where they go 1786 // we convert the java signature to a C signature by inserting 1787 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1788 1789 const int total_in_args = method->size_of_parameters(); 1790 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1791 1792 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1793 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1794 BasicType* in_elem_bt = nullptr; 1795 1796 int argc = 0; 1797 out_sig_bt[argc++] = T_ADDRESS; 1798 if (method->is_static()) { 1799 out_sig_bt[argc++] = T_OBJECT; 1800 } 1801 1802 for (int i = 0; i < total_in_args ; i++ ) { 1803 out_sig_bt[argc++] = in_sig_bt[i]; 1804 } 1805 1806 // Now figure out where the args must be stored and how much stack space 1807 // they require. 1808 int out_arg_slots; 1809 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, nullptr, total_c_args); 1810 1811 // Compute framesize for the wrapper. We need to handlize all oops in 1812 // incoming registers 1813 1814 // Calculate the total number of stack slots we will need. 
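  // Roughly, the accounting below amounts to (an illustrative summary, not
  // authoritative):
  //
  //   stack_slots  = out_preserve_stack_slots() + out_arg_slots   // ABI area + outgoing C args
  //   stack_slots += 6 * VMRegImpl::slots_per_word                // inbound oop handle area
  //   stack_slots += VMRegImpl::slots_per_word                    // handlized klass (static methods only)
  //   stack_slots += VMRegImpl::slots_per_word                    // lock box (synchronized methods only)
  //   stack_slots += 6                                            // 2 slots for moves + 4 for return address and saved rbp
  //   stack_slots  = align_up(stack_slots, StackAlignmentInSlots)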
1815 1816 // First count the abi requirement plus all of the outgoing args 1817 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1818 1819 // Now the space for the inbound oop handle area 1820 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1821 1822 int oop_handle_offset = stack_slots; 1823 stack_slots += total_save_slots; 1824 1825 // Now any space we need for handlizing a klass if static method 1826 1827 int klass_slot_offset = 0; 1828 int klass_offset = -1; 1829 int lock_slot_offset = 0; 1830 bool is_static = false; 1831 1832 if (method->is_static()) { 1833 klass_slot_offset = stack_slots; 1834 stack_slots += VMRegImpl::slots_per_word; 1835 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1836 is_static = true; 1837 } 1838 1839 // Plus a lock if needed 1840 1841 if (method->is_synchronized()) { 1842 lock_slot_offset = stack_slots; 1843 stack_slots += VMRegImpl::slots_per_word; 1844 } 1845 1846 // Now a place (+2) to save return values or temp during shuffling 1847 // + 4 for return address (which we own) and saved rbp 1848 stack_slots += 6; 1849 1850 // Ok The space we have allocated will look like: 1851 // 1852 // 1853 // FP-> | | 1854 // |---------------------| 1855 // | 2 slots for moves | 1856 // |---------------------| 1857 // | lock box (if sync) | 1858 // |---------------------| <- lock_slot_offset 1859 // | klass (if static) | 1860 // |---------------------| <- klass_slot_offset 1861 // | oopHandle area | 1862 // |---------------------| <- oop_handle_offset (6 java arg registers) 1863 // | outbound memory | 1864 // | based arguments | 1865 // | | 1866 // |---------------------| 1867 // | | 1868 // SP-> | out_preserved_slots | 1869 // 1870 // 1871 1872 1873 // Now compute actual number of stack words we need rounding to make 1874 // stack properly aligned. 1875 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1876 1877 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1878 1879 // First thing make an ic check to see if we should even be here 1880 1881 // We are free to use all registers as temps without saving them and 1882 // restoring them except rbp. rbp is the only callee save register 1883 // as far as the interpreter and the compiler(s) are concerned. 1884 1885 1886 const Register ic_reg = rax; 1887 const Register receiver = j_rarg0; 1888 1889 Label hit; 1890 Label exception_pending; 1891 1892 assert_different_registers(ic_reg, receiver, rscratch1, rscratch2); 1893 __ verify_oop(receiver); 1894 __ load_klass(rscratch1, receiver, rscratch2); 1895 __ cmpq(ic_reg, rscratch1); 1896 __ jcc(Assembler::equal, hit); 1897 1898 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1899 1900 // Verified entry point must be aligned 1901 __ align(8); 1902 1903 __ bind(hit); 1904 1905 int vep_offset = ((intptr_t)__ pc()) - start; 1906 1907 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1908 Label L_skip_barrier; 1909 Register klass = r10; 1910 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1911 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1912 1913 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1914 1915 __ bind(L_skip_barrier); 1916 } 1917 1918 #ifdef COMPILER1 1919 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
1920   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1921     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1922   }
1923 #endif // COMPILER1
1924 
1925   // The instruction at the verified entry point must be 5 bytes or longer
1926   // because it can be patched on the fly by make_non_entrant. The stack bang
1927   // instruction fits that requirement.
1928 
1929   // Generate stack overflow check
1930   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1931 
1932   // Generate a new frame for the wrapper.
1933   __ enter();
1934   // -2 because return address is already present and so is saved rbp
1935   __ subptr(rsp, stack_size - 2*wordSize);
1936 
1937   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1938   // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
1939   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1940 
1941   // Frame is now completed as far as size and linkage.
1942   int frame_complete = ((intptr_t)__ pc()) - start;
1943 
1944   if (UseRTMLocking) {
1945     // Abort RTM transaction before calling JNI
1946     // because critical section will be large and will be
1947     // aborted anyway. Also nmethod could be deoptimized.
1948     __ xabort(0);
1949   }
1950 
1951 #ifdef ASSERT
1952   __ check_stack_alignment(rsp, "improperly aligned stack");
1953 #endif /* ASSERT */
1954 
1955 
1956   // We use r14 as the oop handle for the receiver/klass
1957   // It is callee save so it survives the call to native
1958 
1959   const Register oop_handle_reg = r14;
1960 
1961   //
1962   // We immediately shuffle the arguments so that for any vm call we have to
1963   // make from here on out (sync slow path, jvmti, etc.) we will have
1964   // captured the oops from our caller and have a valid oopMap for
1965   // them.
1966 
1967   // -----------------
1968   // The Grand Shuffle
1969 
1970   // The Java calling convention is either equal (linux) or denser (win64) than the
1971   // c calling convention. However, because of the jni_env argument the c calling
1972   // convention always has at least one more (and two for static) arguments than Java.
1973   // Therefore if we move the args from java -> c backwards then we will never have
1974   // a register->register conflict and we don't have to build a dependency graph
1975   // and figure out how to break any cycles.
1976   //
1977 
1978   // Record esp-based slot for receiver on stack for non-static methods
1979   int receiver_offset = -1;
1980 
1981   // This is a trick. We double the stack slots so we can claim
1982   // the oops in the caller's frame. Since we are sure to have
1983   // more args than the caller, doubling is enough to make
1984   // sure we can capture all the incoming oop args from the
1985   // caller.
1986   //
1987   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1988 
1989   // Mark location of rbp (someday)
1990   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1991 
1992   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1993   // All inbound args are referenced based on rbp and all outbound args via rsp.
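  // For example (illustrative only): for a non-static native method taking one
  // int, total_in_args == 2 (receiver + int) and total_c_args == 3
  // (JNIEnv* + receiver + int), so the shuffle loop below records the moves as
  //
  //   java arg 1 (int)      -> c arg 2
  //   java arg 0 (receiver) -> c arg 1
  //
  // and c arg 0 (the JNIEnv*) is loaded just before the native call.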
1994 1995 1996 #ifdef ASSERT 1997 bool reg_destroyed[Register::number_of_registers]; 1998 bool freg_destroyed[XMMRegister::number_of_registers]; 1999 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2000 reg_destroyed[r] = false; 2001 } 2002 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2003 freg_destroyed[f] = false; 2004 } 2005 2006 #endif /* ASSERT */ 2007 2008 // For JNI natives the incoming and outgoing registers are offset upwards. 2009 GrowableArray<int> arg_order(2 * total_in_args); 2010 2011 VMRegPair tmp_vmreg; 2012 tmp_vmreg.set2(rbx->as_VMReg()); 2013 2014 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2015 arg_order.push(i); 2016 arg_order.push(c_arg); 2017 } 2018 2019 int temploc = -1; 2020 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2021 int i = arg_order.at(ai); 2022 int c_arg = arg_order.at(ai + 1); 2023 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2024 #ifdef ASSERT 2025 if (in_regs[i].first()->is_Register()) { 2026 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2027 } else if (in_regs[i].first()->is_XMMRegister()) { 2028 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2029 } 2030 if (out_regs[c_arg].first()->is_Register()) { 2031 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2032 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2033 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2034 } 2035 #endif /* ASSERT */ 2036 switch (in_sig_bt[i]) { 2037 case T_ARRAY: 2038 case T_OBJECT: 2039 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2040 ((i == 0) && (!is_static)), 2041 &receiver_offset); 2042 break; 2043 case T_VOID: 2044 break; 2045 2046 case T_FLOAT: 2047 __ float_move(in_regs[i], out_regs[c_arg]); 2048 break; 2049 2050 case T_DOUBLE: 2051 assert( i + 1 < total_in_args && 2052 in_sig_bt[i + 1] == T_VOID && 2053 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2054 __ double_move(in_regs[i], out_regs[c_arg]); 2055 break; 2056 2057 case T_LONG : 2058 __ long_move(in_regs[i], out_regs[c_arg]); 2059 break; 2060 2061 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2062 2063 default: 2064 __ move32_64(in_regs[i], out_regs[c_arg]); 2065 } 2066 } 2067 2068 int c_arg; 2069 2070 // Pre-load a static method's oop into r14. Used both by locking code and 2071 // the normal JNI call code. 2072 // point c_arg at the first arg that is already loaded in case we 2073 // need to spill before we call out 2074 c_arg = total_c_args - total_in_args; 2075 2076 if (method->is_static()) { 2077 2078 // load oop into a register 2079 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2080 2081 // Now handlize the static class mirror it's known not-null. 2082 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2083 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2084 2085 // Now get the handle 2086 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2087 // store the klass handle as second argument 2088 __ movptr(c_rarg1, oop_handle_reg); 2089 // and protect the arg if we must spill 2090 c_arg--; 2091 } 2092 2093 // Change state to native (we save the return address in the thread, since it might not 2094 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2095 // points into the right code segment. 
It does not have to be the correct return pc. 2096 // We use the same pc/oopMap repeatedly when we call out 2097 2098 intptr_t the_pc = (intptr_t) __ pc(); 2099 oop_maps->add_gc_map(the_pc - start, map); 2100 2101 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1); 2102 2103 2104 // We have all of the arguments setup at this point. We must not touch any register 2105 // argument registers at this point (what if we save/restore them there are no oop? 2106 2107 { 2108 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2109 // protect the args we've loaded 2110 save_args(masm, total_c_args, c_arg, out_regs); 2111 __ mov_metadata(c_rarg1, method()); 2112 __ call_VM_leaf( 2113 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2114 r15_thread, c_rarg1); 2115 restore_args(masm, total_c_args, c_arg, out_regs); 2116 } 2117 2118 // RedefineClasses() tracing support for obsolete method entry 2119 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2120 // protect the args we've loaded 2121 save_args(masm, total_c_args, c_arg, out_regs); 2122 __ mov_metadata(c_rarg1, method()); 2123 __ call_VM_leaf( 2124 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2125 r15_thread, c_rarg1); 2126 restore_args(masm, total_c_args, c_arg, out_regs); 2127 } 2128 2129 // Lock a synchronized method 2130 2131 // Register definitions used by locking and unlocking 2132 2133 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2134 const Register obj_reg = rbx; // Will contain the oop 2135 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2136 const Register old_hdr = r13; // value of old header at unlock time 2137 2138 Label slow_path_lock; 2139 Label lock_done; 2140 2141 if (method->is_synchronized()) { 2142 Label count_mon; 2143 2144 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2145 2146 // Get the handle (the 2nd argument) 2147 __ mov(oop_handle_reg, c_rarg1); 2148 2149 // Get address of the box 2150 2151 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2152 2153 // Load the oop from the handle 2154 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2155 2156 if (LockingMode == LM_MONITOR) { 2157 __ jmp(slow_path_lock); 2158 } else if (LockingMode == LM_LEGACY) { 2159 // Load immediate 1 into swap_reg %rax 2160 __ movl(swap_reg, 1); 2161 2162 // Load (object->mark() | 1) into swap_reg %rax 2163 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2164 2165 // Save (object->mark() | 1) into BasicLock's displaced header 2166 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2167 2168 // src -> dest iff dest == rax else rax <- dest 2169 __ lock(); 2170 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2171 __ jcc(Assembler::equal, count_mon); 2172 2173 // Hmm should this move to the slow path code area??? 2174 2175 // Test if the oopMark is an obvious stack pointer, i.e., 2176 // 1) (mark & 3) == 0, and 2177 // 2) rsp <= mark < mark + os::pagesize() 2178 // These 3 tests can be done by evaluating the following 2179 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2180 // assuming both stack pointer and pagesize have their 2181 // least significant 2 bits clear. 
2182 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2183 2184 __ subptr(swap_reg, rsp); 2185 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2186 2187 // Save the test result, for recursive case, the result is zero 2188 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2189 __ jcc(Assembler::notEqual, slow_path_lock); 2190 } else { 2191 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2192 // Load object header 2193 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2194 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2195 } 2196 __ bind(count_mon); 2197 __ inc_held_monitor_count(); 2198 2199 // Slow path will re-enter here 2200 __ bind(lock_done); 2201 } 2202 2203 // Finally just about ready to make the JNI call 2204 2205 // get JNIEnv* which is first argument to native 2206 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2207 2208 // Now set thread in native 2209 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2210 2211 __ call(RuntimeAddress(native_func)); 2212 2213 // Verify or restore cpu control state after JNI call 2214 __ restore_cpu_control_state_after_jni(rscratch1); 2215 2216 // Unpack native results. 2217 switch (ret_type) { 2218 case T_BOOLEAN: __ c2bool(rax); break; 2219 case T_CHAR : __ movzwl(rax, rax); break; 2220 case T_BYTE : __ sign_extend_byte (rax); break; 2221 case T_SHORT : __ sign_extend_short(rax); break; 2222 case T_INT : /* nothing to do */ break; 2223 case T_DOUBLE : 2224 case T_FLOAT : 2225 // Result is in xmm0 we'll save as needed 2226 break; 2227 case T_ARRAY: // Really a handle 2228 case T_OBJECT: // Really a handle 2229 break; // can't de-handlize until after safepoint check 2230 case T_VOID: break; 2231 case T_LONG: break; 2232 default : ShouldNotReachHere(); 2233 } 2234 2235 Label after_transition; 2236 2237 // Switch thread to "native transition" state before reading the synchronization state. 2238 // This additional state is necessary because reading and testing the synchronization 2239 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2240 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2241 // VM thread changes sync state to synchronizing and suspends threads for GC. 2242 // Thread A is resumed to finish this native method, but doesn't block here since it 2243 // didn't see any synchronization is progress, and escapes. 2244 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2245 2246 // Force this write out before the read below 2247 if (!UseSystemMemoryBarrier) { 2248 __ membar(Assembler::Membar_mask_bits( 2249 Assembler::LoadLoad | Assembler::LoadStore | 2250 Assembler::StoreLoad | Assembler::StoreStore)); 2251 } 2252 2253 // check for safepoint operation in progress and/or pending suspend requests 2254 { 2255 Label Continue; 2256 Label slow_path; 2257 2258 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2259 2260 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2261 __ jcc(Assembler::equal, Continue); 2262 __ bind(slow_path); 2263 2264 // Don't use call_VM as it will see a possible pending exception and forward it 2265 // and never return here preventing us from clearing _last_native_pc down below. 2266 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2267 // preserved and correspond to the bcp/locals pointers. 
So we do a runtime call 2268 // by hand. 2269 // 2270 __ vzeroupper(); 2271 save_native_result(masm, ret_type, stack_slots); 2272 __ mov(c_rarg0, r15_thread); 2273 __ mov(r12, rsp); // remember sp 2274 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2275 __ andptr(rsp, -16); // align stack as required by ABI 2276 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2277 __ mov(rsp, r12); // restore sp 2278 __ reinit_heapbase(); 2279 // Restore any method result value 2280 restore_native_result(masm, ret_type, stack_slots); 2281 __ bind(Continue); 2282 } 2283 2284 // change thread state 2285 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2286 __ bind(after_transition); 2287 2288 Label reguard; 2289 Label reguard_done; 2290 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2291 __ jcc(Assembler::equal, reguard); 2292 __ bind(reguard_done); 2293 2294 // native result if any is live 2295 2296 // Unlock 2297 Label slow_path_unlock; 2298 Label unlock_done; 2299 if (method->is_synchronized()) { 2300 2301 Label fast_done; 2302 2303 // Get locked oop from the handle we passed to jni 2304 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2305 2306 if (LockingMode == LM_LEGACY) { 2307 Label not_recur; 2308 // Simple recursive lock? 2309 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2310 __ jcc(Assembler::notEqual, not_recur); 2311 __ dec_held_monitor_count(); 2312 __ jmpb(fast_done); 2313 __ bind(not_recur); 2314 } 2315 2316 // Must save rax if it is live now because cmpxchg must use it 2317 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2318 save_native_result(masm, ret_type, stack_slots); 2319 } 2320 2321 if (LockingMode == LM_MONITOR) { 2322 __ jmp(slow_path_unlock); 2323 } else if (LockingMode == LM_LEGACY) { 2324 // get address of the stack lock 2325 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2326 // get old displaced header 2327 __ movptr(old_hdr, Address(rax, 0)); 2328 2329 // Atomic swap old header if oop still contains the stack lock 2330 __ lock(); 2331 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2332 __ jcc(Assembler::notEqual, slow_path_unlock); 2333 __ dec_held_monitor_count(); 2334 } else { 2335 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2336 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2337 __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place); 2338 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2339 __ dec_held_monitor_count(); 2340 } 2341 2342 // slow path re-enters here 2343 __ bind(unlock_done); 2344 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2345 restore_native_result(masm, ret_type, stack_slots); 2346 } 2347 2348 __ bind(fast_done); 2349 } 2350 { 2351 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2352 save_native_result(masm, ret_type, stack_slots); 2353 __ mov_metadata(c_rarg1, method()); 2354 __ call_VM_leaf( 2355 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2356 r15_thread, c_rarg1); 2357 restore_native_result(masm, ret_type, stack_slots); 2358 } 2359 2360 __ reset_last_Java_frame(false); 2361 2362 // Unbox oop result, e.g. JNIHandles::resolve value. 
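  // Conceptually (an illustrative sketch only), resolve_jobject amounts to
  //
  //   rax = (rax == nullptr) ? nullptr : *(oop*)rax;  // plus the GC barrier
  //                                                   // appropriate for the handle kind
  //
  // i.e. it turns the JNI handle returned by the native code back into a raw oop.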
2363 if (is_reference_type(ret_type)) { 2364 __ resolve_jobject(rax /* value */, 2365 r15_thread /* thread */, 2366 rcx /* tmp */); 2367 } 2368 2369 if (CheckJNICalls) { 2370 // clear_pending_jni_exception_check 2371 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2372 } 2373 2374 // reset handle block 2375 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2376 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2377 2378 // pop our frame 2379 2380 __ leave(); 2381 2382 // Any exception pending? 2383 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2384 __ jcc(Assembler::notEqual, exception_pending); 2385 2386 // Return 2387 2388 __ ret(0); 2389 2390 // Unexpected paths are out of line and go here 2391 2392 // forward the exception 2393 __ bind(exception_pending); 2394 2395 // and forward the exception 2396 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2397 2398 // Slow path locking & unlocking 2399 if (method->is_synchronized()) { 2400 2401 // BEGIN Slow path lock 2402 __ bind(slow_path_lock); 2403 2404 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2405 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2406 2407 // protect the args we've loaded 2408 save_args(masm, total_c_args, c_arg, out_regs); 2409 2410 __ mov(c_rarg0, obj_reg); 2411 __ mov(c_rarg1, lock_reg); 2412 __ mov(c_rarg2, r15_thread); 2413 2414 // Not a leaf but we have last_Java_frame setup as we want 2415 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2416 restore_args(masm, total_c_args, c_arg, out_regs); 2417 2418 #ifdef ASSERT 2419 { Label L; 2420 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2421 __ jcc(Assembler::equal, L); 2422 __ stop("no pending exception allowed on exit from monitorenter"); 2423 __ bind(L); 2424 } 2425 #endif 2426 __ jmp(lock_done); 2427 2428 // END Slow path lock 2429 2430 // BEGIN Slow path unlock 2431 __ bind(slow_path_unlock); 2432 2433 // If we haven't already saved the native result we must save it now as xmm registers 2434 // are still exposed. 
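  // (The vzeroupper below clears the upper vector state before calling into the
  //  VM, presumably to avoid AVX/SSE transition penalties in the runtime code.)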
2435 __ vzeroupper(); 2436 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2437 save_native_result(masm, ret_type, stack_slots); 2438 } 2439 2440 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2441 2442 __ mov(c_rarg0, obj_reg); 2443 __ mov(c_rarg2, r15_thread); 2444 __ mov(r12, rsp); // remember sp 2445 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2446 __ andptr(rsp, -16); // align stack as required by ABI 2447 2448 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2449 // NOTE that obj_reg == rbx currently 2450 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2451 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2452 2453 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2454 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2455 __ mov(rsp, r12); // restore sp 2456 __ reinit_heapbase(); 2457 #ifdef ASSERT 2458 { 2459 Label L; 2460 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2461 __ jcc(Assembler::equal, L); 2462 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2463 __ bind(L); 2464 } 2465 #endif /* ASSERT */ 2466 2467 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2468 2469 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2470 restore_native_result(masm, ret_type, stack_slots); 2471 } 2472 __ jmp(unlock_done); 2473 2474 // END Slow path unlock 2475 2476 } // synchronized 2477 2478 // SLOW PATH Reguard the stack if needed 2479 2480 __ bind(reguard); 2481 __ vzeroupper(); 2482 save_native_result(masm, ret_type, stack_slots); 2483 __ mov(r12, rsp); // remember sp 2484 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2485 __ andptr(rsp, -16); // align stack as required by ABI 2486 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2487 __ mov(rsp, r12); // restore sp 2488 __ reinit_heapbase(); 2489 restore_native_result(masm, ret_type, stack_slots); 2490 // and continue 2491 __ jmp(reguard_done); 2492 2493 2494 2495 __ flush(); 2496 2497 nmethod *nm = nmethod::new_native_nmethod(method, 2498 compile_id, 2499 masm->code(), 2500 vep_offset, 2501 frame_complete, 2502 stack_slots / VMRegImpl::slots_per_word, 2503 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2504 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2505 oop_maps); 2506 2507 return nm; 2508 } 2509 2510 // this function returns the adjust size (in number of words) to a c2i adapter 2511 // activation for use during deoptimization 2512 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2513 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2514 } 2515 2516 2517 uint SharedRuntime::out_preserve_stack_slots() { 2518 return 0; 2519 } 2520 2521 2522 // Number of stack slots between incoming argument block and the start of 2523 // a new frame. The PROLOG must add this many slots to the stack. The 2524 // EPILOG must remove this many slots. amd64 needs two slots for 2525 // return address. 
2526 uint SharedRuntime::in_preserve_stack_slots() { 2527 return 4 + 2 * VerifyStackAtCalls; 2528 } 2529 2530 //------------------------------generate_deopt_blob---------------------------- 2531 void SharedRuntime::generate_deopt_blob() { 2532 // Allocate space for the code 2533 ResourceMark rm; 2534 // Setup code generation tools 2535 int pad = 0; 2536 if (UseAVX > 2) { 2537 pad += 1024; 2538 } 2539 #if INCLUDE_JVMCI 2540 if (EnableJVMCI) { 2541 pad += 512; // Increase the buffer size when compiling for JVMCI 2542 } 2543 #endif 2544 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2545 MacroAssembler* masm = new MacroAssembler(&buffer); 2546 int frame_size_in_words; 2547 OopMap* map = nullptr; 2548 OopMapSet *oop_maps = new OopMapSet(); 2549 2550 // ------------- 2551 // This code enters when returning to a de-optimized nmethod. A return 2552 // address has been pushed on the stack, and return values are in 2553 // registers. 2554 // If we are doing a normal deopt then we were called from the patched 2555 // nmethod from the point we returned to the nmethod. So the return 2556 // address on the stack is wrong by NativeCall::instruction_size 2557 // We will adjust the value so it looks like we have the original return 2558 // address on the stack (like when we eagerly deoptimized). 2559 // In the case of an exception pending when deoptimizing, we enter 2560 // with a return address on the stack that points after the call we patched 2561 // into the exception handler. We have the following register state from, 2562 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2563 // rax: exception oop 2564 // rbx: exception handler 2565 // rdx: throwing pc 2566 // So in this case we simply jam rdx into the useless return address and 2567 // the stack looks just like we want. 2568 // 2569 // At this point we need to de-opt. We save the argument return 2570 // registers. We call the first C routine, fetch_unroll_info(). This 2571 // routine captures the return values and returns a structure which 2572 // describes the current frame size and the sizes of all replacement frames. 2573 // The current frame is compiled code and may contain many inlined 2574 // functions, each with their own JVM state. We pop the current frame, then 2575 // push all the new frames. Then we call the C routine unpack_frames() to 2576 // populate these frames. Finally unpack_frames() returns us the new target 2577 // address. Notice that callee-save registers are BLOWN here; they have 2578 // already been captured in the vframeArray at the time the return PC was 2579 // patched. 2580 address start = __ pc(); 2581 Label cont; 2582 2583 // Prolog for non exception case! 2584 2585 // Save everything in sight. 2586 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2587 2588 // Normal deoptimization. Save exec mode for unpack_frames. 
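  // (For reference, the exec modes stashed in r14 in this blob are
  //  Unpack_deopt, Unpack_reexecute and Unpack_exception; the uncommon trap
  //  blob below uses Unpack_uncommon_trap.)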
2589 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2590 __ jmp(cont); 2591 2592 int reexecute_offset = __ pc() - start; 2593 #if INCLUDE_JVMCI && !defined(COMPILER1) 2594 if (EnableJVMCI && UseJVMCICompiler) { 2595 // JVMCI does not use this kind of deoptimization 2596 __ should_not_reach_here(); 2597 } 2598 #endif 2599 2600 // Reexecute case 2601 // return address is the pc describes what bci to do re-execute at 2602 2603 // No need to update map as each call to save_live_registers will produce identical oopmap 2604 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2605 2606 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2607 __ jmp(cont); 2608 2609 #if INCLUDE_JVMCI 2610 Label after_fetch_unroll_info_call; 2611 int implicit_exception_uncommon_trap_offset = 0; 2612 int uncommon_trap_offset = 0; 2613 2614 if (EnableJVMCI) { 2615 implicit_exception_uncommon_trap_offset = __ pc() - start; 2616 2617 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2618 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2619 2620 uncommon_trap_offset = __ pc() - start; 2621 2622 // Save everything in sight. 2623 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2624 // fetch_unroll_info needs to call last_java_frame() 2625 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2626 2627 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2628 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2629 2630 __ movl(r14, Deoptimization::Unpack_reexecute); 2631 __ mov(c_rarg0, r15_thread); 2632 __ movl(c_rarg2, r14); // exec mode 2633 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2634 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2635 2636 __ reset_last_Java_frame(false); 2637 2638 __ jmp(after_fetch_unroll_info_call); 2639 } // EnableJVMCI 2640 #endif // INCLUDE_JVMCI 2641 2642 int exception_offset = __ pc() - start; 2643 2644 // Prolog for exception case 2645 2646 // all registers are dead at this entry point, except for rax, and 2647 // rdx which contain the exception oop and exception pc 2648 // respectively. Set them in TLS and fall thru to the 2649 // unpack_with_exception_in_tls entry point. 2650 2651 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2652 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2653 2654 int exception_in_tls_offset = __ pc() - start; 2655 2656 // new implementation because exception oop is now passed in JavaThread 2657 2658 // Prolog for exception case 2659 // All registers must be preserved because they might be used by LinearScan 2660 // Exceptiop oop and throwing PC are passed in JavaThread 2661 // tos: stack at point of call to method that threw the exception (i.e. only 2662 // args are on the stack, no return address) 2663 2664 // make room on stack for the return address 2665 // It will be patched later with the throwing pc. The correct value is not 2666 // available now because loading it from memory would destroy registers. 2667 __ push(0); 2668 2669 // Save everything in sight. 2670 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2671 2672 // Now it is safe to overwrite any register 2673 2674 // Deopt during an exception. 
Save exec mode for unpack_frames. 2675 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2676 2677 // load throwing pc from JavaThread and patch it as the return address 2678 // of the current frame. Then clear the field in JavaThread 2679 2680 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2681 __ movptr(Address(rbp, wordSize), rdx); 2682 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2683 2684 #ifdef ASSERT 2685 // verify that there is really an exception oop in JavaThread 2686 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2687 __ verify_oop(rax); 2688 2689 // verify that there is no pending exception 2690 Label no_pending_exception; 2691 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2692 __ testptr(rax, rax); 2693 __ jcc(Assembler::zero, no_pending_exception); 2694 __ stop("must not have pending exception here"); 2695 __ bind(no_pending_exception); 2696 #endif 2697 2698 __ bind(cont); 2699 2700 // Call C code. Need thread and this frame, but NOT official VM entry 2701 // crud. We cannot block on this call, no GC can happen. 2702 // 2703 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2704 2705 // fetch_unroll_info needs to call last_java_frame(). 2706 2707 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2708 #ifdef ASSERT 2709 { Label L; 2710 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2711 __ jcc(Assembler::equal, L); 2712 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2713 __ bind(L); 2714 } 2715 #endif // ASSERT 2716 __ mov(c_rarg0, r15_thread); 2717 __ movl(c_rarg1, r14); // exec_mode 2718 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2719 2720 // Need to have an oopmap that tells fetch_unroll_info where to 2721 // find any register it might need. 2722 oop_maps->add_gc_map(__ pc() - start, map); 2723 2724 __ reset_last_Java_frame(false); 2725 2726 #if INCLUDE_JVMCI 2727 if (EnableJVMCI) { 2728 __ bind(after_fetch_unroll_info_call); 2729 } 2730 #endif 2731 2732 // Load UnrollBlock* into rdi 2733 __ mov(rdi, rax); 2734 2735 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2736 Label noException; 2737 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2738 __ jcc(Assembler::notEqual, noException); 2739 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2740 // QQQ this is useless it was null above 2741 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2742 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2743 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2744 2745 __ verify_oop(rax); 2746 2747 // Overwrite the result registers with the exception results. 2748 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2749 // I think this is useless 2750 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2751 2752 __ bind(noException); 2753 2754 // Only register save data is on the stack. 2755 // Now restore the result registers. Everything else is either dead 2756 // or captured in the vframeArray. 2757 RegisterSaver::restore_result_registers(masm); 2758 2759 // All of the register save area has been popped of the stack. Only the 2760 // return address remains. 2761 2762 // Pop all the frames we must move/replace. 
2763 // 2764 // Frame picture (youngest to oldest) 2765 // 1: self-frame (no frame link) 2766 // 2: deopting frame (no frame link) 2767 // 3: caller of deopting frame (could be compiled/interpreted). 2768 // 2769 // Note: by leaving the return address of self-frame on the stack 2770 // and using the size of frame 2 to adjust the stack 2771 // when we are done the return to frame 3 will still be on the stack. 2772 2773 // Pop deoptimized frame 2774 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2775 __ addptr(rsp, rcx); 2776 2777 // rsp should be pointing at the return address to the caller (3) 2778 2779 // Pick up the initial fp we should save 2780 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2781 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2782 2783 #ifdef ASSERT 2784 // Compilers generate code that bang the stack by as much as the 2785 // interpreter would need. So this stack banging should never 2786 // trigger a fault. Verify that it does not on non product builds. 2787 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2788 __ bang_stack_size(rbx, rcx); 2789 #endif 2790 2791 // Load address of array of frame pcs into rcx 2792 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2793 2794 // Trash the old pc 2795 __ addptr(rsp, wordSize); 2796 2797 // Load address of array of frame sizes into rsi 2798 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2799 2800 // Load counter into rdx 2801 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2802 2803 // Now adjust the caller's stack to make up for the extra locals 2804 // but record the original sp so that we can save it in the skeletal interpreter 2805 // frame and the stack walking of interpreter_sender will get the unextended sp 2806 // value and not the "real" sp value. 2807 2808 const Register sender_sp = r8; 2809 2810 __ mov(sender_sp, rsp); 2811 __ movl(rbx, Address(rdi, 2812 Deoptimization::UnrollBlock:: 2813 caller_adjustment_offset())); 2814 __ subptr(rsp, rbx); 2815 2816 // Push interpreter frames in a loop 2817 Label loop; 2818 __ bind(loop); 2819 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2820 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2821 __ pushptr(Address(rcx, 0)); // Save return address 2822 __ enter(); // Save old & set new ebp 2823 __ subptr(rsp, rbx); // Prolog 2824 // This value is corrected by layout_activation_impl 2825 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2826 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2827 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2828 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2829 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2830 __ decrementl(rdx); // Decrement counter 2831 __ jcc(Assembler::notZero, loop); 2832 __ pushptr(Address(rcx, 0)); // Save final return address 2833 2834 // Re-push self-frame 2835 __ enter(); // Save old & set new ebp 2836 2837 // Allocate a full sized register save area. 2838 // Return address and rbp are in place, so we allocate two less words. 
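  // (Illustratively: the enter() above already pushed rbp and the return
  //  address already occupies one word, hence the "- 2" in the allocation below.)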
2839 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2840 2841 // Restore frame locals after moving the frame 2842 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2843 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2844 2845 // Call C code. Need thread but NOT official VM entry 2846 // crud. We cannot block on this call, no GC can happen. Call should 2847 // restore return values to their stack-slots with the new SP. 2848 // 2849 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2850 2851 // Use rbp because the frames look interpreted now 2852 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2853 // Don't need the precise return PC here, just precise enough to point into this code blob. 2854 address the_pc = __ pc(); 2855 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2856 2857 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2858 __ mov(c_rarg0, r15_thread); 2859 __ movl(c_rarg1, r14); // second arg: exec_mode 2860 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2861 // Revert SP alignment after call since we're going to do some SP relative addressing below 2862 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2863 2864 // Set an oopmap for the call site 2865 // Use the same PC we used for the last java frame 2866 oop_maps->add_gc_map(the_pc - start, 2867 new OopMap( frame_size_in_words, 0 )); 2868 2869 // Clear fp AND pc 2870 __ reset_last_Java_frame(true); 2871 2872 // Collect return values 2873 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2874 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2875 // I think this is useless (throwing pc?) 2876 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2877 2878 // Pop self-frame. 2879 __ leave(); // Epilog 2880 2881 // Jump to interpreter 2882 __ ret(0); 2883 2884 // Make sure all code is generated 2885 masm->flush(); 2886 2887 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2888 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2889 #if INCLUDE_JVMCI 2890 if (EnableJVMCI) { 2891 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2892 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2893 } 2894 #endif 2895 } 2896 2897 #ifdef COMPILER2 2898 //------------------------------generate_uncommon_trap_blob-------------------- 2899 void SharedRuntime::generate_uncommon_trap_blob() { 2900 // Allocate space for the code 2901 ResourceMark rm; 2902 // Setup code generation tools 2903 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2904 MacroAssembler* masm = new MacroAssembler(&buffer); 2905 2906 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2907 2908 address start = __ pc(); 2909 2910 if (UseRTMLocking) { 2911 // Abort RTM transaction before possible nmethod deoptimization. 2912 __ xabort(0); 2913 } 2914 2915 // Push self-frame. We get here with a return address on the 2916 // stack, so rsp is 8-byte aligned until we allocate our frame. 2917 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2918 2919 // No callee saved registers. 
rbp is assumed implicitly saved 2920 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2921 2922 // compiler left unloaded_class_index in j_rarg0 move to where the 2923 // runtime expects it. 2924 __ movl(c_rarg1, j_rarg0); 2925 2926 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2927 2928 // Call C code. Need thread but NOT official VM entry 2929 // crud. We cannot block on this call, no GC can happen. Call should 2930 // capture callee-saved registers as well as return values. 2931 // Thread is in rdi already. 2932 // 2933 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2934 2935 __ mov(c_rarg0, r15_thread); 2936 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2937 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2938 2939 // Set an oopmap for the call site 2940 OopMapSet* oop_maps = new OopMapSet(); 2941 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2942 2943 // location of rbp is known implicitly by the frame sender code 2944 2945 oop_maps->add_gc_map(__ pc() - start, map); 2946 2947 __ reset_last_Java_frame(false); 2948 2949 // Load UnrollBlock* into rdi 2950 __ mov(rdi, rax); 2951 2952 #ifdef ASSERT 2953 { Label L; 2954 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()), 2955 Deoptimization::Unpack_uncommon_trap); 2956 __ jcc(Assembler::equal, L); 2957 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap"); 2958 __ bind(L); 2959 } 2960 #endif 2961 2962 // Pop all the frames we must move/replace. 2963 // 2964 // Frame picture (youngest to oldest) 2965 // 1: self-frame (no frame link) 2966 // 2: deopting frame (no frame link) 2967 // 3: caller of deopting frame (could be compiled/interpreted). 2968 2969 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2970 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 2971 2972 // Pop deoptimized frame (int) 2973 __ movl(rcx, Address(rdi, 2974 Deoptimization::UnrollBlock:: 2975 size_of_deoptimized_frame_offset())); 2976 __ addptr(rsp, rcx); 2977 2978 // rsp should be pointing at the return address to the caller (3) 2979 2980 // Pick up the initial fp we should save 2981 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2982 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2983 2984 #ifdef ASSERT 2985 // Compilers generate code that bang the stack by as much as the 2986 // interpreter would need. So this stack banging should never 2987 // trigger a fault. Verify that it does not on non product builds. 
2988 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2989 __ bang_stack_size(rbx, rcx); 2990 #endif 2991 2992 // Load address of array of frame pcs into rcx (address*) 2993 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2994 2995 // Trash the return pc 2996 __ addptr(rsp, wordSize); 2997 2998 // Load address of array of frame sizes into rsi (intptr_t*) 2999 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset())); 3000 3001 // Counter 3002 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset())); // (int) 3003 3004 // Now adjust the caller's stack to make up for the extra locals but 3005 // record the original sp so that we can save it in the skeletal 3006 // interpreter frame and the stack walking of interpreter_sender 3007 // will get the unextended sp value and not the "real" sp value. 3008 3009 const Register sender_sp = r8; 3010 3011 __ mov(sender_sp, rsp); 3012 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset())); // (int) 3013 __ subptr(rsp, rbx); 3014 3015 // Push interpreter frames in a loop 3016 Label loop; 3017 __ bind(loop); 3018 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3019 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 3020 __ pushptr(Address(rcx, 0)); // Save return address 3021 __ enter(); // Save old & set new rbp 3022 __ subptr(rsp, rbx); // Prolog 3023 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 3024 sender_sp); // Make it walkable 3025 // This value is corrected by layout_activation_impl 3026 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3027 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3028 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3029 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3030 __ decrementl(rdx); // Decrement counter 3031 __ jcc(Assembler::notZero, loop); 3032 __ pushptr(Address(rcx, 0)); // Save final return address 3033 3034 // Re-push self-frame 3035 __ enter(); // Save old & set new rbp 3036 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 3037 // Prolog 3038 3039 // Use rbp because the frames look interpreted now 3040 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3041 // Don't need the precise return PC here, just precise enough to point into this code blob. 3042 address the_pc = __ pc(); 3043 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3044 3045 // Call C code. Need thread but NOT official VM entry 3046 // crud. We cannot block on this call, no GC can happen. Call should 3047 // restore return values to their stack-slots with the new SP. 3048 // Thread is in rdi already. 3049 // 3050 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 3051 3052 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 3053 __ mov(c_rarg0, r15_thread); 3054 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 3055 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3056 3057 // Set an oopmap for the call site 3058 // Use the same PC we used for the last java frame 3059 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3060 3061 // Clear fp AND pc 3062 __ reset_last_Java_frame(true); 3063 3064 // Pop self-frame. 
  __ leave();                 // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2

//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;

  // Allocate space for the code. Setup code generation tools.
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);

  if (UseRTMLocking) {
    // Abort the RTM transaction before calling the runtime
    // because the critical section will be large and will be
    // aborted anyway. Also the nmethod could be deoptimized.
    __ xabort(0);
  }

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special;

    // If our stashed return pc was modified by the runtime, we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jccb(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)

    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jcc(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jcc(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill out other meta info
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call.
// All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");

  // allocate space for the code
  ResourceMark rm;

  CodeBuffer buffer(name, 1200, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to, assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob; the frame size is in words
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
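// In other words, the (len+1)-word little-endian value carry:a has b
// subtracted from its low len words in place, and the adjusted top word is
// returned. A small worked example (values chosen only for illustration, not
// taken from the callers below): with len == 1, a = { 0 }, b = { 1 } and
// carry == 5, the low word borrows, leaving a = { 0xFFFFFFFFFFFFFFFF }, and
// the function returns 5 - 1 == 4.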
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
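//
// A brief sketch of the word-level invariant the loops below rely on (this
// only restates the reference above): inv is precomputed so that
// inv * n[0] == -1 (mod 2^64), which is what the asserts check via
// inv * n[0] == ULLONG_MAX. Choosing m[i] = t0 * inv then gives
//
//   t0 + m[i] * n[0] == t0 + t0 * inv * n[0] == t0 - t0 == 0   (mod 2^64)
//
// so after the final MACC(m[i], n[0], ...) in each outer iteration the low
// accumulator word t0 must be zero and can be shifted out, which is exactly
// what the "assert(t0 == 0, ...); t0 = t1; t1 = t2; t2 = 0;" steps do.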

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Compiled code jumps to this blob when an exception is thrown
// (see emit_exception_handler in the x86_64.ad file).
//
// Given an exception pc at a call, we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e. callee-save registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
//
// This code is entered with a jmp.
//
// Arguments:
//   rax: exception oop
//   rdx: exception pc
//
// Results:
//   rax: exception oop
//   rdx: exception pc in caller or ???
//   destination: exception handler of caller
//
// Note: the exception pc MUST be at a call (precise debug information)
//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
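//
// One sizing detail, spelled out here because the assert below depends on it:
// SimpleRuntimeFrame offsets are in 4-byte jint slots, so requiring
// framesize % 4 == 0 is the same as requiring the frame to be a whole number
// of 16-byte units, which keeps rsp 16-byte aligned as the x86-64 ABI expects
// at calls.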
//

void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");

  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");

  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);


  address start = __ pc();

  // Exception pc is 'return address' for stack walker
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog

  // Save callee-saved registers. See x86_64.ad.

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-save registers now that adapter frames are gone.

  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);

  // Store the exception in the Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);

  // This call does all the hard work. It checks if an exception handler
  // exists in the method.
  // If so, it returns the handler address.
  // If not, it prepares for stack-unwinding, restoring the callee-save
  // registers of the frame being removed.
  //
  // address OptoRuntime::handle_exception_C(JavaThread* thread)

  // At a method handle call, the stack may not be properly aligned
  // when returning with an exception.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));

  // Set an oopmap for the call site. This oopmap will only be used if we
  // are unwinding the stack. Hence, all locations will be dead.
  // Callee-saved registers will be the same as the frame above (i.e.,
  // handle_exception_stub), since they were restored when we got the
  // exception.

  OopMapSet* oop_maps = new OopMapSet();

  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));

  __ reset_last_Java_frame(false);

  // Restore callee-saved registers

  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-save registers now that adapter frames are gone.

  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));

  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx); // No need for exception pc anymore

  // rax: exception handler

  // We have a handler in rax (could be deopt blob).
  __ mov(r8, rax);

  // Get the exception oop
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // Get the exception pc in case we are deoptimized
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
#endif
  // Clear the exception oop so GC no longer processes it as a root.
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);

  // rax: exception oop
  // r8:  exception handler
  // rdx: exception pc
  // Jump to handler

  __ jmp(r8);

  // Make sure all code is generated
  masm->flush();

  // Set exception blob
  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2