1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #endif 29 #include "asm/macroAssembler.hpp" 30 #include "asm/macroAssembler.inline.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/icBuffer.hpp" 34 #include "code/nativeInst.hpp" 35 #include "code/vtableStubs.hpp" 36 #include "compiler/oopMap.hpp" 37 #include "gc/shared/collectedHeap.hpp" 38 #include "gc/shared/gcLocker.hpp" 39 #include "gc/shared/barrierSet.hpp" 40 #include "gc/shared/barrierSetAssembler.hpp" 41 #include "interpreter/interpreter.hpp" 42 #include "logging/log.hpp" 43 #include "memory/resourceArea.hpp" 44 #include "memory/universe.hpp" 45 #include "oops/compiledICHolder.hpp" 46 #include "oops/klass.inline.hpp" 47 #include "oops/method.inline.hpp" 48 #include "prims/methodHandles.hpp" 49 #include "runtime/continuation.hpp" 50 #include "runtime/continuationEntry.inline.hpp" 51 #include "runtime/globals.hpp" 52 #include "runtime/jniHandles.hpp" 53 #include "runtime/safepointMechanism.hpp" 54 #include "runtime/sharedRuntime.hpp" 55 #include "runtime/signature.hpp" 56 #include "runtime/stubRoutines.hpp" 57 #include "runtime/vframeArray.hpp" 58 #include "runtime/vm_version.hpp" 59 #include "utilities/align.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 75 76 class SimpleRuntimeFrame { 77 78 public: 79 80 // Most of the runtime stubs have this simple frame layout. 81 // This class exists to make the layout shared in one place. 82 // Offsets are for compiler stack slots, which are jints. 83 enum layout { 84 // The frame sender code expects that rbp will be in the "natural" place and 85 // will override any oopMap setting for it. We must therefore force the layout 86 // so that it agrees with the frame sender code. 87 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 88 rbp_off2, 89 return_off, return_off2, 90 framesize 91 }; 92 }; 93 94 class RegisterSaver { 95 // Capture info about frame layout. Layout offsets are in jint 96 // units because compiler frame slots are jints. 
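// Byte offsets of the vector-state components within the FPU/vector save
// area reserved by push_CPU_state(). They follow the standard XSAVE memory
// layout: the legacy fxsave image keeps xmm0..xmm15 at offset 160, the ymm
// upper halves start at 576, the opmask registers k0..k7 at 1088, the zmm
// upper halves at 1152, and the full-width "upper bank" zmm16..zmm31 at 1664.
// save_live_registers()/restore_live_registers() below store and reload the
// AVX/EVEX pieces at exactly these offsets.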
97 #define XSAVE_AREA_BEGIN 160 98 #define XSAVE_AREA_YMM_BEGIN 576 99 #define XSAVE_AREA_OPMASK_BEGIN 1088 100 #define XSAVE_AREA_ZMM_BEGIN 1152 101 #define XSAVE_AREA_UPPERBANK 1664 102 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 103 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 104 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 105 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 107 enum layout { 108 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 109 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 110 DEF_XMM_OFFS(0), 111 DEF_XMM_OFFS(1), 112 // 2..15 are implied in range usage 113 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 114 DEF_YMM_OFFS(0), 115 DEF_YMM_OFFS(1), 116 // 2..15 are implied in range usage 117 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 118 DEF_OPMASK_OFFS(0), 119 DEF_OPMASK_OFFS(1), 120 // 2..7 are implied in range usage 121 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 122 DEF_ZMM_OFFS(0), 123 DEF_ZMM_OFFS(1), 124 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 125 DEF_ZMM_UPPER_OFFS(16), 126 DEF_ZMM_UPPER_OFFS(17), 127 // 18..31 are implied in range usage 128 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 129 fpu_stateH_end, 130 r15_off, r15H_off, 131 r14_off, r14H_off, 132 r13_off, r13H_off, 133 r12_off, r12H_off, 134 r11_off, r11H_off, 135 r10_off, r10H_off, 136 r9_off, r9H_off, 137 r8_off, r8H_off, 138 rdi_off, rdiH_off, 139 rsi_off, rsiH_off, 140 ignore_off, ignoreH_off, // extra copy of rbp 141 rsp_off, rspH_off, 142 rbx_off, rbxH_off, 143 rdx_off, rdxH_off, 144 rcx_off, rcxH_off, 145 rax_off, raxH_off, 146 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 147 align_off, alignH_off, 148 flags_off, flagsH_off, 149 // The frame sender code expects that rbp will be in the "natural" place and 150 // will override any oopMap setting for it. We must therefore force the layout 151 // so that it agrees with the frame sender code. 
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both the vector and the
  // non-vector cases allocate the frame this way.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just like a normal enter.
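  // A rough sketch of the frame built below (higher addresses on top; the
  // names refer to the layout enum above):
  //
  //   return address                           <- return_off
  //   saved rbp (from enter())                 <- rbp_off
  //   flags, alignment word                    <- flags_off / align_off
  //   rax .. r15 (incl. an extra rbp copy)     <- pushed by push_CPU_state()
  //   fxsave/xsave image                       <- fpu_state_off
  //   argument register shadow area            <- rsp (frame::arg_reg_save_area_bytes)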
201 202 __ enter(); // rsp becomes 16-byte aligned here 203 __ push_CPU_state(); // Push a multiple of 16 bytes 204 205 // push cpu state handles this on EVEX enabled targets 206 if (save_wide_vectors) { 207 // Save upper half of YMM registers(0..15) 208 int base_addr = XSAVE_AREA_YMM_BEGIN; 209 for (int n = 0; n < 16; n++) { 210 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 211 } 212 if (VM_Version::supports_evex()) { 213 // Save upper half of ZMM registers(0..15) 214 base_addr = XSAVE_AREA_ZMM_BEGIN; 215 for (int n = 0; n < 16; n++) { 216 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 217 } 218 // Save full ZMM registers(16..num_xmm_regs) 219 base_addr = XSAVE_AREA_UPPERBANK; 220 off = 0; 221 int vector_len = Assembler::AVX_512bit; 222 for (int n = 16; n < num_xmm_regs; n++) { 223 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 224 } 225 #if COMPILER2_OR_JVMCI 226 base_addr = XSAVE_AREA_OPMASK_BEGIN; 227 off = 0; 228 for(int n = 0; n < KRegister::number_of_registers; n++) { 229 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 230 } 231 #endif 232 } 233 } else { 234 if (VM_Version::supports_evex()) { 235 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 236 int base_addr = XSAVE_AREA_UPPERBANK; 237 off = 0; 238 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 239 for (int n = 16; n < num_xmm_regs; n++) { 240 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 241 } 242 #if COMPILER2_OR_JVMCI 243 base_addr = XSAVE_AREA_OPMASK_BEGIN; 244 off = 0; 245 for(int n = 0; n < KRegister::number_of_registers; n++) { 246 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 247 } 248 #endif 249 } 250 } 251 __ vzeroupper(); 252 if (frame::arg_reg_save_area_bytes != 0) { 253 // Allocate argument register save area 254 __ subptr(rsp, frame::arg_reg_save_area_bytes); 255 } 256 257 // Set an oopmap for the call site. This oopmap will map all 258 // oop-registers and debug-info registers as callee-saved. This 259 // will allow deoptimization at this safepoint to find all possible 260 // debug-info recordings, as well as let GC find all oops. 
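  // The entries recorded below reuse the layout enum values directly: each
  // *_off constant is a jint-sized slot index relative to rsp, turned into a
  // stack VMReg via VMRegImpl::stack2reg() (wrapped by STACK_OFFSET). For
  // example, recording rax at rax_off tells deoptimization and GC that the
  // saved rax value lives at [rsp + rax_off * BytesPerInt] in this frame.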
261 262 OopMapSet *oop_maps = new OopMapSet(); 263 OopMap* map = new OopMap(frame_size_in_slots, 0); 264 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 266 267 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 268 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 269 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 270 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 271 // rbp location is known implicitly by the frame sender code, needs no oopmap 272 // and the location where rbp was saved by is ignored 273 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 274 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 275 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 276 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 277 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 278 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 279 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 280 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 281 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 282 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 283 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 284 // on EVEX enabled targets, we get it included in the xsave area 285 off = xmm0_off; 286 int delta = xmm1_off - off; 287 for (int n = 0; n < 16; n++) { 288 XMMRegister xmm_name = as_XMMRegister(n); 289 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 290 off += delta; 291 } 292 if (UseAVX > 2) { 293 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 294 off = zmm16_off; 295 delta = zmm17_off - off; 296 for (int n = 16; n < num_xmm_regs; n++) { 297 XMMRegister zmm_name = as_XMMRegister(n); 298 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 299 off += delta; 300 } 301 } 302 303 #if COMPILER2_OR_JVMCI 304 if (save_wide_vectors) { 305 // Save upper half of YMM registers(0..15) 306 off = ymm0_off; 307 delta = ymm1_off - ymm0_off; 308 for (int n = 0; n < 16; n++) { 309 XMMRegister ymm_name = as_XMMRegister(n); 310 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 311 off += delta; 312 } 313 if (VM_Version::supports_evex()) { 314 // Save upper half of ZMM registers(0..15) 315 off = zmm0_off; 316 delta = zmm1_off - zmm0_off; 317 for (int n = 0; n < 16; n++) { 318 XMMRegister zmm_name = as_XMMRegister(n); 319 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 320 off += delta; 321 } 322 } 323 } 324 #endif // COMPILER2_OR_JVMCI 325 326 // %%% These should all be a waste but we'll keep things as they were for now 327 if (true) { 328 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 329 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 330 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 331 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 332 // rbp location is known implicitly by the frame sender code, needs no oopmap 333 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 334 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 335 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 336 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 337 map->set_callee_saved(STACK_OFFSET( r10H_off ), 
r10->as_VMReg()->next()); 338 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 339 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 340 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 341 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 342 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 343 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 344 // on EVEX enabled targets, we get it included in the xsave area 345 off = xmm0H_off; 346 delta = xmm1H_off - off; 347 for (int n = 0; n < 16; n++) { 348 XMMRegister xmm_name = as_XMMRegister(n); 349 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 350 off += delta; 351 } 352 if (UseAVX > 2) { 353 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 354 off = zmm16H_off; 355 delta = zmm17H_off - off; 356 for (int n = 16; n < num_xmm_regs; n++) { 357 XMMRegister zmm_name = as_XMMRegister(n); 358 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 359 off += delta; 360 } 361 } 362 } 363 364 return map; 365 } 366 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 368 int num_xmm_regs = XMMRegister::available_xmm_registers(); 369 if (frame::arg_reg_save_area_bytes != 0) { 370 // Pop arg register save area 371 __ addptr(rsp, frame::arg_reg_save_area_bytes); 372 } 373 374 #if COMPILER2_OR_JVMCI 375 if (restore_wide_vectors) { 376 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 377 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 378 } 379 #else 380 assert(!restore_wide_vectors, "vectors are generated only by C2"); 381 #endif 382 383 __ vzeroupper(); 384 385 // On EVEX enabled targets everything is handled in pop fpu state 386 if (restore_wide_vectors) { 387 // Restore upper half of YMM registers (0..15) 388 int base_addr = XSAVE_AREA_YMM_BEGIN; 389 for (int n = 0; n < 16; n++) { 390 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 391 } 392 if (VM_Version::supports_evex()) { 393 // Restore upper half of ZMM registers (0..15) 394 base_addr = XSAVE_AREA_ZMM_BEGIN; 395 for (int n = 0; n < 16; n++) { 396 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 397 } 398 // Restore full ZMM registers(16..num_xmm_regs) 399 base_addr = XSAVE_AREA_UPPERBANK; 400 int vector_len = Assembler::AVX_512bit; 401 int off = 0; 402 for (int n = 16; n < num_xmm_regs; n++) { 403 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 404 } 405 #if COMPILER2_OR_JVMCI 406 base_addr = XSAVE_AREA_OPMASK_BEGIN; 407 off = 0; 408 for (int n = 0; n < KRegister::number_of_registers; n++) { 409 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 410 } 411 #endif 412 } 413 } else { 414 if (VM_Version::supports_evex()) { 415 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 416 int base_addr = XSAVE_AREA_UPPERBANK; 417 int off = 0; 418 int vector_len = VM_Version::supports_avx512vl() ? 
                                     Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and registers.
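  // Illustrative sketch (not generated code): for a hypothetical signature
  // (int, long, Object, double) the loop below produces
  //   sig_bt[0] T_INT    -> j_rarg0       (set1)
  //   sig_bt[1] T_LONG   -> j_rarg1       (set2)
  //   sig_bt[2] T_VOID   -> set_bad()     (high half of the long)
  //   sig_bt[3] T_OBJECT -> j_rarg2       (set2)
  //   sig_bt[4] T_DOUBLE -> j_farg0       (set2)
  //   sig_bt[5] T_VOID   -> set_bad()     (high half of the double)
  // Only when the j_rarg/j_farg registers run out do arguments move to
  // 4-byte stack slots, two slots (stk_args += 2) per argument.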
489 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 490 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 491 }; 492 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 493 j_farg0, j_farg1, j_farg2, j_farg3, 494 j_farg4, j_farg5, j_farg6, j_farg7 495 }; 496 497 498 uint int_args = 0; 499 uint fp_args = 0; 500 uint stk_args = 0; // inc by 2 each time 501 502 for (int i = 0; i < total_args_passed; i++) { 503 switch (sig_bt[i]) { 504 case T_BOOLEAN: 505 case T_CHAR: 506 case T_BYTE: 507 case T_SHORT: 508 case T_INT: 509 if (int_args < Argument::n_int_register_parameters_j) { 510 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 511 } else { 512 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 513 stk_args += 2; 514 } 515 break; 516 case T_VOID: 517 // halves of T_LONG or T_DOUBLE 518 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 519 regs[i].set_bad(); 520 break; 521 case T_LONG: 522 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 523 // fall through 524 case T_OBJECT: 525 case T_ARRAY: 526 case T_ADDRESS: 527 if (int_args < Argument::n_int_register_parameters_j) { 528 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 529 } else { 530 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 531 stk_args += 2; 532 } 533 break; 534 case T_FLOAT: 535 if (fp_args < Argument::n_float_register_parameters_j) { 536 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 537 } else { 538 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 539 stk_args += 2; 540 } 541 break; 542 case T_DOUBLE: 543 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 544 if (fp_args < Argument::n_float_register_parameters_j) { 545 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 546 } else { 547 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 548 stk_args += 2; 549 } 550 break; 551 default: 552 ShouldNotReachHere(); 553 break; 554 } 555 } 556 557 return align_up(stk_args, 2); 558 } 559 560 // Patch the callers callsite with entry to compiled code if it exists. 561 static void patch_callers_callsite(MacroAssembler *masm) { 562 Label L; 563 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 564 __ jcc(Assembler::equal, L); 565 566 // Save the current stack pointer 567 __ mov(r13, rsp); 568 // Schedule the branch target address early. 
569 // Call into the VM to patch the caller, then jump to compiled callee 570 // rax isn't live so capture return address while we easily can 571 __ movptr(rax, Address(rsp, 0)); 572 573 // align stack so push_CPU_state doesn't fault 574 __ andptr(rsp, -(StackAlignmentInBytes)); 575 __ push_CPU_state(); 576 __ vzeroupper(); 577 // VM needs caller's callsite 578 // VM needs target method 579 // This needs to be a long call since we will relocate this adapter to 580 // the codeBuffer and it may not reach 581 582 // Allocate argument register save area 583 if (frame::arg_reg_save_area_bytes != 0) { 584 __ subptr(rsp, frame::arg_reg_save_area_bytes); 585 } 586 __ mov(c_rarg0, rbx); 587 __ mov(c_rarg1, rax); 588 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 589 590 // De-allocate argument register save area 591 if (frame::arg_reg_save_area_bytes != 0) { 592 __ addptr(rsp, frame::arg_reg_save_area_bytes); 593 } 594 595 __ vzeroupper(); 596 __ pop_CPU_state(); 597 // restore sp 598 __ mov(rsp, r13); 599 __ bind(L); 600 } 601 602 603 static void gen_c2i_adapter(MacroAssembler *masm, 604 int total_args_passed, 605 int comp_args_on_stack, 606 const BasicType *sig_bt, 607 const VMRegPair *regs, 608 Label& skip_fixup) { 609 // Before we get into the guts of the C2I adapter, see if we should be here 610 // at all. We've come from compiled code and are attempting to jump to the 611 // interpreter, which means the caller made a static call to get here 612 // (vcalls always get a compiled target if there is one). Check for a 613 // compiled target. If there is one, we need to patch the caller's call. 614 patch_callers_callsite(masm); 615 616 __ bind(skip_fixup); 617 618 // Since all args are passed on the stack, total_args_passed * 619 // Interpreter::stackElementSize is the space we need. 620 621 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 622 623 int extraspace = (total_args_passed * Interpreter::stackElementSize); 624 625 // stack is aligned, keep it that way 626 // This is not currently needed or enforced by the interpreter, but 627 // we might as well conform to the ABI. 628 extraspace = align_up(extraspace, 2*wordSize); 629 630 // set senderSP value 631 __ lea(r13, Address(rsp, wordSize)); 632 633 #ifdef ASSERT 634 __ check_stack_alignment(r13, "sender stack not aligned"); 635 #endif 636 if (extraspace > 0) { 637 // Pop the return address 638 __ pop(rax); 639 640 __ subptr(rsp, extraspace); 641 642 // Push the return address 643 __ push(rax); 644 645 // Account for the return address location since we store it first rather 646 // than hold it in a register across all the shuffling 647 extraspace += wordSize; 648 } 649 650 #ifdef ASSERT 651 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 652 #endif 653 654 // Now write the args into the outgoing interpreter space 655 for (int i = 0; i < total_args_passed; i++) { 656 if (sig_bt[i] == T_VOID) { 657 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 658 continue; 659 } 660 661 // offset to start parameters 662 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 663 int next_off = st_off - Interpreter::stackElementSize; 664 665 // Say 4 args: 666 // i st_off 667 // 0 32 T_LONG 668 // 1 24 T_VOID 669 // 2 16 T_OBJECT 670 // 3 8 T_BOOL 671 // - 0 return address 672 // 673 // However to make thing extra confusing. 
    // Because we can fit a long/double in a single slot on a 64-bit VM, and it
    // would be silly to break them up, the interpreter leaves one slot empty
    // and only stores to a single slot. In this case the slot that is occupied
    // is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory: use rax as a temporary
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float: use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
775 // In addition we use r13 to locate all the interpreter args as 776 // we must align the stack to 16 bytes on an i2c entry else we 777 // lose alignment we expect in all compiled code and register 778 // save code can segv when fxsave instructions find improperly 779 // aligned stack pointer. 780 781 // Adapters can be frameless because they do not require the caller 782 // to perform additional cleanup work, such as correcting the stack pointer. 783 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 784 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 785 // even if a callee has modified the stack pointer. 786 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 787 // routinely repairs its caller's stack pointer (from sender_sp, which is set 788 // up via the senderSP register). 789 // In other words, if *either* the caller or callee is interpreted, we can 790 // get the stack pointer repaired after a call. 791 // This is why c2i and i2c adapters cannot be indefinitely composed. 792 // In particular, if a c2i adapter were to somehow call an i2c adapter, 793 // both caller and callee would be compiled methods, and neither would 794 // clean up the stack pointer changes performed by the two adapters. 795 // If this happens, control eventually transfers back to the compiled 796 // caller, but with an uncorrected stack, causing delayed havoc. 797 798 if (VerifyAdapterCalls && 799 (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) { 800 // So, let's test for cascading c2i/i2c adapters right now. 801 // assert(Interpreter::contains($return_addr) || 802 // StubRoutines::contains($return_addr), 803 // "i2c adapter must return to an interpreter frame"); 804 __ block_comment("verify_i2c { "); 805 // Pick up the return address 806 __ movptr(rax, Address(rsp, 0)); 807 Label L_ok; 808 if (Interpreter::code() != NULL) 809 range_check(masm, rax, r11, 810 Interpreter::code()->code_start(), Interpreter::code()->code_end(), 811 L_ok); 812 if (StubRoutines::code1() != NULL) 813 range_check(masm, rax, r11, 814 StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(), 815 L_ok); 816 if (StubRoutines::code2() != NULL) 817 range_check(masm, rax, r11, 818 StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(), 819 L_ok); 820 const char* msg = "i2c adapter must return to an interpreter frame"; 821 __ block_comment(msg); 822 __ stop(msg); 823 __ bind(L_ok); 824 __ block_comment("} verify_i2ce "); 825 } 826 827 // Must preserve original SP for loading incoming arguments because 828 // we need to align the outgoing SP for compiled code. 829 __ movptr(r11, rsp); 830 831 // Pick up the return address 832 __ pop(rax); 833 834 // Convert 4-byte c2 stack slots to words. 835 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 836 837 if (comp_args_on_stack) { 838 __ subptr(rsp, comp_words_on_stack * wordSize); 839 } 840 841 // Ensure compiled code always sees stack at proper alignment 842 __ andptr(rsp, -16); 843 844 // push the return address and misalign the stack that youngest frame always sees 845 // as far as the placement of the call instruction 846 __ push(rax); 847 848 // Put saved SP in another register 849 const Register saved_sp = rax; 850 __ movptr(saved_sp, r11); 851 852 // Will jump to the compiled code just as if compiled code was doing it. 853 // Pre-load the register-jump target early, to schedule it better. 
854 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 855 856 #if INCLUDE_JVMCI 857 if (EnableJVMCI) { 858 // check if this call should be routed towards a specific entry point 859 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 860 Label no_alternative_target; 861 __ jcc(Assembler::equal, no_alternative_target); 862 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 863 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 864 __ bind(no_alternative_target); 865 } 866 #endif // INCLUDE_JVMCI 867 868 // Now generate the shuffle code. Pick up all register args and move the 869 // rest through the floating point stack top. 870 for (int i = 0; i < total_args_passed; i++) { 871 if (sig_bt[i] == T_VOID) { 872 // Longs and doubles are passed in native word order, but misaligned 873 // in the 32-bit build. 874 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 875 continue; 876 } 877 878 // Pick up 0, 1 or 2 words from SP+offset. 879 880 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 881 "scrambled load targets?"); 882 // Load in argument order going down. 883 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 884 // Point to interpreter value (vs. tag) 885 int next_off = ld_off - Interpreter::stackElementSize; 886 // 887 // 888 // 889 VMReg r_1 = regs[i].first(); 890 VMReg r_2 = regs[i].second(); 891 if (!r_1->is_valid()) { 892 assert(!r_2->is_valid(), ""); 893 continue; 894 } 895 if (r_1->is_stack()) { 896 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 897 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 898 899 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 900 // and if we end up going thru a c2i because of a miss a reasonable value of r13 901 // will be generated. 902 if (!r_2->is_valid()) { 903 // sign extend??? 904 __ movl(r13, Address(saved_sp, ld_off)); 905 __ movptr(Address(rsp, st_off), r13); 906 } else { 907 // 908 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 909 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 910 // So we must adjust where to pick up the data to match the interpreter. 911 // 912 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 913 // are accessed as negative so LSW is at LOW address 914 915 // ld_off is MSW so get LSW 916 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 917 next_off : ld_off; 918 __ movq(r13, Address(saved_sp, offset)); 919 // st_off is LSW (i.e. reg.first()) 920 __ movq(Address(rsp, st_off), r13); 921 } 922 } else if (r_1->is_Register()) { // Register argument 923 Register r = r_1->as_Register(); 924 assert(r != rax, "must be different"); 925 if (r_2->is_valid()) { 926 // 927 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 928 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 929 // So we must adjust where to pick up the data to match the interpreter. 930 931 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 932 next_off : ld_off; 933 934 // this can be a misaligned move 935 __ movq(r, Address(saved_sp, offset)); 936 } else { 937 // sign extend and use a full word? 
938 __ movl(r, Address(saved_sp, ld_off)); 939 } 940 } else { 941 if (!r_2->is_valid()) { 942 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 943 } else { 944 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 945 } 946 } 947 } 948 949 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 950 951 // 6243940 We might end up in handle_wrong_method if 952 // the callee is deoptimized as we race thru here. If that 953 // happens we don't want to take a safepoint because the 954 // caller frame will look interpreted and arguments are now 955 // "compiled" so it is much better to make this transition 956 // invisible to the stack walking code. Unfortunately if 957 // we try and find the callee by normal means a safepoint 958 // is possible. So we stash the desired callee in the thread 959 // and the vm will find there should this case occur. 960 961 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 962 963 // put Method* where a c2i would expect should we end up there 964 // only needed because eof c2 resolve stubs return Method* as a result in 965 // rax 966 __ mov(rax, rbx); 967 __ jmp(r11); 968 } 969 970 // --------------------------------------------------------------- 971 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 972 int total_args_passed, 973 int comp_args_on_stack, 974 const BasicType *sig_bt, 975 const VMRegPair *regs, 976 AdapterFingerPrint* fingerprint) { 977 address i2c_entry = __ pc(); 978 979 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 980 981 // ------------------------------------------------------------------------- 982 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 983 // to the interpreter. The args start out packed in the compiled layout. They 984 // need to be unpacked into the interpreter layout. This will almost always 985 // require some stack space. We grow the current (compiled) stack, then repack 986 // the args. We finally end in a jump to the generic interpreter entry point. 987 // On exit from the interpreter, the interpreter will restore our SP (lest the 988 // compiled code, which relies solely on SP and not RBP, get sick). 989 990 address c2i_unverified_entry = __ pc(); 991 Label skip_fixup; 992 Label ok; 993 994 Register holder = rax; 995 Register receiver = j_rarg0; 996 Register temp = rbx; 997 998 { 999 __ load_klass(temp, receiver, rscratch1); 1000 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset())); 1001 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset())); 1002 __ jcc(Assembler::equal, ok); 1003 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1004 1005 __ bind(ok); 1006 // Method might have been compiled since the call site was patched to 1007 // interpreted if that is the case treat it as a miss so we can get 1008 // the call site corrected. 
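    // (rbx must hold the Method* at this point: both patch_callers_callsite()
    // and the interpreter entry used by gen_c2i_adapter() expect to find it
    // there.)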
1009 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1010 __ jcc(Assembler::equal, skip_fixup); 1011 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1012 } 1013 1014 address c2i_entry = __ pc(); 1015 1016 // Class initialization barrier for static methods 1017 address c2i_no_clinit_check_entry = NULL; 1018 if (VM_Version::supports_fast_class_init_checks()) { 1019 Label L_skip_barrier; 1020 Register method = rbx; 1021 1022 { // Bypass the barrier for non-static methods 1023 Register flags = rscratch1; 1024 __ movl(flags, Address(method, Method::access_flags_offset())); 1025 __ testl(flags, JVM_ACC_STATIC); 1026 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1027 } 1028 1029 Register klass = rscratch1; 1030 __ load_method_holder(klass, method); 1031 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1032 1033 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1034 1035 __ bind(L_skip_barrier); 1036 c2i_no_clinit_check_entry = __ pc(); 1037 } 1038 1039 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1040 bs->c2i_entry_barrier(masm); 1041 1042 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1043 1044 __ flush(); 1045 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1046 } 1047 1048 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1049 VMRegPair *regs, 1050 VMRegPair *regs2, 1051 int total_args_passed) { 1052 assert(regs2 == NULL, "not needed on x86"); 1053 // We return the amount of VMRegImpl stack slots we need to reserve for all 1054 // the arguments NOT counting out_preserve_stack_slots. 1055 1056 // NOTE: These arrays will have to change when c1 is ported 1057 #ifdef _WIN64 1058 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1059 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1060 }; 1061 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1062 c_farg0, c_farg1, c_farg2, c_farg3 1063 }; 1064 #else 1065 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1066 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1067 }; 1068 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1069 c_farg0, c_farg1, c_farg2, c_farg3, 1070 c_farg4, c_farg5, c_farg6, c_farg7 1071 }; 1072 #endif // _WIN64 1073 1074 1075 uint int_args = 0; 1076 uint fp_args = 0; 1077 uint stk_args = 0; // inc by 2 each time 1078 1079 for (int i = 0; i < total_args_passed; i++) { 1080 switch (sig_bt[i]) { 1081 case T_BOOLEAN: 1082 case T_CHAR: 1083 case T_BYTE: 1084 case T_SHORT: 1085 case T_INT: 1086 if (int_args < Argument::n_int_register_parameters_c) { 1087 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1088 #ifdef _WIN64 1089 fp_args++; 1090 // Allocate slots for callee to stuff register args the stack. 
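        // (On Windows x64 each register argument also owns a "home" slot on
        // the stack, which is why an integer argument consumes an XMM register
        // position as well (fp_args++) and why stk_args is clamped to at least
        // 8 slots, i.e. 32 bytes, at the end of this function.)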
1091 stk_args += 2; 1092 #endif 1093 } else { 1094 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1095 stk_args += 2; 1096 } 1097 break; 1098 case T_LONG: 1099 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1100 // fall through 1101 case T_OBJECT: 1102 case T_ARRAY: 1103 case T_ADDRESS: 1104 case T_METADATA: 1105 if (int_args < Argument::n_int_register_parameters_c) { 1106 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1107 #ifdef _WIN64 1108 fp_args++; 1109 stk_args += 2; 1110 #endif 1111 } else { 1112 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1113 stk_args += 2; 1114 } 1115 break; 1116 case T_FLOAT: 1117 if (fp_args < Argument::n_float_register_parameters_c) { 1118 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1119 #ifdef _WIN64 1120 int_args++; 1121 // Allocate slots for callee to stuff register args the stack. 1122 stk_args += 2; 1123 #endif 1124 } else { 1125 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1126 stk_args += 2; 1127 } 1128 break; 1129 case T_DOUBLE: 1130 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1131 if (fp_args < Argument::n_float_register_parameters_c) { 1132 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1133 #ifdef _WIN64 1134 int_args++; 1135 // Allocate slots for callee to stuff register args the stack. 1136 stk_args += 2; 1137 #endif 1138 } else { 1139 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1140 stk_args += 2; 1141 } 1142 break; 1143 case T_VOID: // Halves of longs and doubles 1144 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1145 regs[i].set_bad(); 1146 break; 1147 default: 1148 ShouldNotReachHere(); 1149 break; 1150 } 1151 } 1152 #ifdef _WIN64 1153 // windows abi requires that we always allocate enough stack space 1154 // for 4 64bit registers to be stored down. 1155 if (stk_args < 8) { 1156 stk_args = 8; 1157 } 1158 #endif // _WIN64 1159 1160 return stk_args; 1161 } 1162 1163 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1164 uint num_bits, 1165 uint total_args_passed) { 1166 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1167 "only certain vector sizes are supported for now"); 1168 1169 static const XMMRegister VEC_ArgReg[32] = { 1170 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1171 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1172 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1173 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1174 }; 1175 1176 uint stk_args = 0; 1177 uint fp_args = 0; 1178 1179 for (uint i = 0; i < total_args_passed; i++) { 1180 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1181 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 
7 : 15)); 1182 regs[i].set_pair(vmreg->next(next_val), vmreg); 1183 } 1184 1185 return stk_args; 1186 } 1187 1188 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1189 // We always ignore the frame_slots arg and just use the space just below frame pointer 1190 // which by this time is free to use 1191 switch (ret_type) { 1192 case T_FLOAT: 1193 __ movflt(Address(rbp, -wordSize), xmm0); 1194 break; 1195 case T_DOUBLE: 1196 __ movdbl(Address(rbp, -wordSize), xmm0); 1197 break; 1198 case T_VOID: break; 1199 default: { 1200 __ movptr(Address(rbp, -wordSize), rax); 1201 } 1202 } 1203 } 1204 1205 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1206 // We always ignore the frame_slots arg and just use the space just below frame pointer 1207 // which by this time is free to use 1208 switch (ret_type) { 1209 case T_FLOAT: 1210 __ movflt(xmm0, Address(rbp, -wordSize)); 1211 break; 1212 case T_DOUBLE: 1213 __ movdbl(xmm0, Address(rbp, -wordSize)); 1214 break; 1215 case T_VOID: break; 1216 default: { 1217 __ movptr(rax, Address(rbp, -wordSize)); 1218 } 1219 } 1220 } 1221 1222 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1223 for ( int i = first_arg ; i < arg_count ; i++ ) { 1224 if (args[i].first()->is_Register()) { 1225 __ push(args[i].first()->as_Register()); 1226 } else if (args[i].first()->is_XMMRegister()) { 1227 __ subptr(rsp, 2*wordSize); 1228 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1229 } 1230 } 1231 } 1232 1233 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1234 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1235 if (args[i].first()->is_Register()) { 1236 __ pop(args[i].first()->as_Register()); 1237 } else if (args[i].first()->is_XMMRegister()) { 1238 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1239 __ addptr(rsp, 2*wordSize); 1240 } 1241 } 1242 } 1243 1244 static void verify_oop_args(MacroAssembler* masm, 1245 const methodHandle& method, 1246 const BasicType* sig_bt, 1247 const VMRegPair* regs) { 1248 Register temp_reg = rbx; // not part of any compiled calling seq 1249 if (VerifyOops) { 1250 for (int i = 0; i < method->size_of_parameters(); i++) { 1251 if (is_reference_type(sig_bt[i])) { 1252 VMReg r = regs[i].first(); 1253 assert(r->is_valid(), "bad oop arg"); 1254 if (r->is_stack()) { 1255 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1256 __ verify_oop(temp_reg); 1257 } else { 1258 __ verify_oop(r->as_Register()); 1259 } 1260 } 1261 } 1262 } 1263 } 1264 1265 static void check_continuation_enter_argument(VMReg actual_vmreg, 1266 Register expected_reg, 1267 const char* name) { 1268 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1269 assert(actual_vmreg->as_Register() == expected_reg, 1270 "%s is in unexpected register: %s instead of %s", 1271 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1272 } 1273 1274 1275 //---------------------------- continuation_enter_setup --------------------------- 1276 // 1277 // Arguments: 1278 // None. 
1279 // 1280 // Results: 1281 // rsp: pointer to blank ContinuationEntry 1282 // 1283 // Kills: 1284 // rax 1285 // 1286 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1287 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1288 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1289 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1290 1291 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1292 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1293 1294 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1295 OopMap* map = new OopMap(frame_size, 0); 1296 1297 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1298 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1299 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1300 1301 return map; 1302 } 1303 1304 //---------------------------- fill_continuation_entry --------------------------- 1305 // 1306 // Arguments: 1307 // rsp: pointer to blank Continuation entry 1308 // reg_cont_obj: pointer to the continuation 1309 // reg_flags: flags 1310 // 1311 // Results: 1312 // rsp: pointer to filled out ContinuationEntry 1313 // 1314 // Kills: 1315 // rax 1316 // 1317 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1318 assert_different_registers(rax, reg_cont_obj, reg_flags); 1319 #ifdef ASSERT 1320 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1321 #endif 1322 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1323 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1324 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1325 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1326 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1327 1328 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1329 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1330 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1331 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1332 1333 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1334 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1335 } 1336 1337 //---------------------------- continuation_enter_cleanup --------------------------- 1338 // 1339 // Arguments: 1340 // rsp: pointer to the ContinuationEntry 1341 // 1342 // Results: 1343 // rsp: pointer to the spilled rbp in the entry frame 1344 // 1345 // Kills: 1346 // rbx 1347 // 1348 void static continuation_enter_cleanup(MacroAssembler* masm) { 1349 #ifdef ASSERT 1350 Label L_good_sp; 1351 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1352 __ jcc(Assembler::equal, L_good_sp); 1353 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1354 __ bind(L_good_sp); 1355 #endif 1356 1357 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1358 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1359 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1360 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1361 1362 __ movptr(rbx, 
Address(rsp, ContinuationEntry::parent_offset())); 1363 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1364 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1365 } 1366 1367 static void gen_continuation_enter(MacroAssembler* masm, 1368 const VMRegPair* regs, 1369 int& exception_offset, 1370 OopMapSet* oop_maps, 1371 int& frame_complete, 1372 int& stack_slots, 1373 int& interpreted_entry_offset, 1374 int& compiled_entry_offset) { 1375 1376 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1377 int pos_cont_obj = 0; 1378 int pos_is_cont = 1; 1379 int pos_is_virtual = 2; 1380 1381 // The platform-specific calling convention may present the arguments in various registers. 1382 // To simplify the rest of the code, we expect the arguments to reside at these known 1383 // registers, and we additionally check the placement here in case calling convention ever 1384 // changes. 1385 Register reg_cont_obj = c_rarg1; 1386 Register reg_is_cont = c_rarg2; 1387 Register reg_is_virtual = c_rarg3; 1388 1389 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1390 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1391 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1392 1393 // Utility methods kill rax, make sure there are no collisions 1394 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1395 1396 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1397 relocInfo::static_call_type); 1398 1399 address start = __ pc(); 1400 1401 Label L_thaw, L_exit; 1402 1403 // i2i entry used at interp_only_mode only 1404 interpreted_entry_offset = __ pc() - start; 1405 { 1406 #ifdef ASSERT 1407 Label is_interp_only; 1408 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1409 __ jcc(Assembler::notEqual, is_interp_only); 1410 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1411 __ bind(is_interp_only); 1412 #endif 1413 1414 __ pop(rax); // return address 1415 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1416 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1417 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1418 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1419 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1420 __ push(rax); // return address 1421 __ push_cont_fastpath(); 1422 1423 __ enter(); 1424 1425 stack_slots = 2; // will be adjusted in setup 1426 OopMap* map = continuation_enter_setup(masm, stack_slots); 1427 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1428 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1429 1430 __ verify_oop(reg_cont_obj); 1431 1432 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1433 1434 // If continuation, call to thaw. Otherwise, resolve the call and exit. 
1435 __ testptr(reg_is_cont, reg_is_cont); 1436 __ jcc(Assembler::notZero, L_thaw); 1437 1438 // --- Resolve path 1439 1440 // Make sure the call is patchable 1441 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1442 // Emit stub for static call 1443 CodeBuffer* cbuf = masm->code_section()->outer(); 1444 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc()); 1445 if (stub == nullptr) { 1446 fatal("CodeCache is full at gen_continuation_enter"); 1447 } 1448 __ call(resolve); 1449 oop_maps->add_gc_map(__ pc() - start, map); 1450 __ post_call_nop(); 1451 1452 __ jmp(L_exit); 1453 } 1454 1455 // compiled entry 1456 __ align(CodeEntryAlignment); 1457 compiled_entry_offset = __ pc() - start; 1458 __ enter(); 1459 1460 stack_slots = 2; // will be adjusted in setup 1461 OopMap* map = continuation_enter_setup(masm, stack_slots); 1462 1463 // Frame is now completed as far as size and linkage. 1464 frame_complete = __ pc() - start; 1465 1466 __ verify_oop(reg_cont_obj); 1467 1468 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1469 1470 // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1471 __ testptr(reg_is_cont, reg_is_cont); 1472 __ jccb(Assembler::notZero, L_thaw); 1473 1474 // --- call Continuation.enter(Continuation c, boolean isContinue) 1475 1476 // Make sure the call is patchable 1477 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1478 1479 // Emit stub for static call 1480 CodeBuffer* cbuf = masm->code_section()->outer(); 1481 address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc()); 1482 if (stub == nullptr) { 1483 fatal("CodeCache is full at gen_continuation_enter"); 1484 } 1485 1486 // The call needs to be resolved. There's a special case for this in 1487 // SharedRuntime::find_callee_info_helper() which calls 1488 // LinkResolver::resolve_continuation_enter() which resolves the call to 1489 // Continuation.enter(Continuation c, boolean isContinue). 
1490 __ call(resolve); 1491 1492 oop_maps->add_gc_map(__ pc() - start, map); 1493 __ post_call_nop(); 1494 1495 __ jmpb(L_exit); 1496 1497 // --- Thawing path 1498 1499 __ bind(L_thaw); 1500 1501 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1502 1503 ContinuationEntry::_return_pc_offset = __ pc() - start; 1504 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1505 __ post_call_nop(); 1506 1507 // --- Normal exit (resolve/thawing) 1508 1509 __ bind(L_exit); 1510 1511 continuation_enter_cleanup(masm); 1512 __ pop(rbp); 1513 __ ret(0); 1514 1515 // --- Exception handling path 1516 1517 exception_offset = __ pc() - start; 1518 1519 continuation_enter_cleanup(masm); 1520 __ pop(rbp); 1521 1522 __ movptr(c_rarg0, r15_thread); 1523 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1524 1525 // rax still holds the original exception oop, save it before the call 1526 __ push(rax); 1527 1528 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1529 __ movptr(rbx, rax); 1530 1531 // Continue at exception handler: 1532 // rax: exception oop 1533 // rbx: exception handler 1534 // rdx: exception pc 1535 __ pop(rax); 1536 __ verify_oop(rax); 1537 __ pop(rdx); 1538 __ jmp(rbx); 1539 } 1540 1541 static void gen_continuation_yield(MacroAssembler* masm, 1542 const VMRegPair* regs, 1543 OopMapSet* oop_maps, 1544 int& frame_complete, 1545 int& stack_slots, 1546 int& compiled_entry_offset) { 1547 enum layout { 1548 rbp_off, 1549 rbpH_off, 1550 return_off, 1551 return_off2, 1552 framesize // inclusive of return address 1553 }; 1554 stack_slots = framesize / VMRegImpl::slots_per_word; 1555 assert(stack_slots == 2, "recheck layout"); 1556 1557 address start = __ pc(); 1558 compiled_entry_offset = __ pc() - start; 1559 __ enter(); 1560 address the_pc = __ pc(); 1561 1562 frame_complete = the_pc - start; 1563 1564 // This nop must be exactly at the PC we push into the frame info. 1565 // We use this nop for fast CodeBlob lookup, associate the OopMap 1566 // with it right away. 
1567 __ post_call_nop(); 1568 OopMap* map = new OopMap(framesize, 1); 1569 oop_maps->add_gc_map(frame_complete, map); 1570 1571 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1572 __ movptr(c_rarg0, r15_thread); 1573 __ movptr(c_rarg1, rsp); 1574 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1575 __ reset_last_Java_frame(true); 1576 1577 Label L_pinned; 1578 1579 __ testptr(rax, rax); 1580 __ jcc(Assembler::notZero, L_pinned); 1581 1582 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1583 continuation_enter_cleanup(masm); 1584 __ pop(rbp); 1585 __ ret(0); 1586 1587 __ bind(L_pinned); 1588 1589 // Pinned, return to caller 1590 1591 // handle pending exception thrown by freeze 1592 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1593 Label ok; 1594 __ jcc(Assembler::equal, ok); 1595 __ leave(); 1596 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1597 __ bind(ok); 1598 1599 __ leave(); 1600 __ ret(0); 1601 } 1602 1603 static void gen_special_dispatch(MacroAssembler* masm, 1604 const methodHandle& method, 1605 const BasicType* sig_bt, 1606 const VMRegPair* regs) { 1607 verify_oop_args(masm, method, sig_bt, regs); 1608 vmIntrinsics::ID iid = method->intrinsic_id(); 1609 1610 // Now write the args into the outgoing interpreter space 1611 bool has_receiver = false; 1612 Register receiver_reg = noreg; 1613 int member_arg_pos = -1; 1614 Register member_reg = noreg; 1615 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1616 if (ref_kind != 0) { 1617 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1618 member_reg = rbx; // known to be free at this point 1619 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1620 } else if (iid == vmIntrinsics::_invokeBasic) { 1621 has_receiver = true; 1622 } else if (iid == vmIntrinsics::_linkToNative) { 1623 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1624 member_reg = rbx; // known to be free at this point 1625 } else { 1626 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1627 } 1628 1629 if (member_reg != noreg) { 1630 // Load the member_arg into register, if necessary. 1631 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1632 VMReg r = regs[member_arg_pos].first(); 1633 if (r->is_stack()) { 1634 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1635 } else { 1636 // no data motion is needed 1637 member_reg = r->as_Register(); 1638 } 1639 } 1640 1641 if (has_receiver) { 1642 // Make sure the receiver is loaded into a register. 1643 assert(method->size_of_parameters() > 0, "oob"); 1644 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1645 VMReg r = regs[0].first(); 1646 assert(r->is_valid(), "bad receiver arg"); 1647 if (r->is_stack()) { 1648 // Porting note: This assumes that compiled calling conventions always 1649 // pass the receiver oop in a register. If this is not true on some 1650 // platform, pick a temp and load the receiver from stack. 
1651 fatal("receiver always in a register"); 1652 receiver_reg = j_rarg0; // known to be free at this point 1653 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1654 } else { 1655 // no data motion is needed 1656 receiver_reg = r->as_Register(); 1657 } 1658 } 1659 1660 // Figure out which address we are really jumping to: 1661 MethodHandles::generate_method_handle_dispatch(masm, iid, 1662 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1663 } 1664 1665 // --------------------------------------------------------------------------- 1666 // Generate a native wrapper for a given method. The method takes arguments 1667 // in the Java compiled code convention, marshals them to the native 1668 // convention (handlizes oops, etc), transitions to native, makes the call, 1669 // returns to java state (possibly blocking), unhandlizes any result and 1670 // returns. 1671 // 1672 // Critical native functions are a shorthand for the use of 1673 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1674 // functions. The wrapper is expected to unpack the arguments before 1675 // passing them to the callee. Critical native functions leave the state _in_Java, 1676 // since they cannot stop for GC. 1677 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1678 // block and the check for pending exceptions it's impossible for them 1679 // to be thrown. 1680 // 1681 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1682 const methodHandle& method, 1683 int compile_id, 1684 BasicType* in_sig_bt, 1685 VMRegPair* in_regs, 1686 BasicType ret_type) { 1687 if (method->is_continuation_native_intrinsic()) { 1688 int exception_offset = -1; 1689 OopMapSet* oop_maps = new OopMapSet(); 1690 int frame_complete = -1; 1691 int stack_slots = -1; 1692 int interpreted_entry_offset = -1; 1693 int vep_offset = -1; 1694 if (method->is_continuation_enter_intrinsic()) { 1695 gen_continuation_enter(masm, 1696 in_regs, 1697 exception_offset, 1698 oop_maps, 1699 frame_complete, 1700 stack_slots, 1701 interpreted_entry_offset, 1702 vep_offset); 1703 } else if (method->is_continuation_yield_intrinsic()) { 1704 gen_continuation_yield(masm, 1705 in_regs, 1706 oop_maps, 1707 frame_complete, 1708 stack_slots, 1709 vep_offset); 1710 } else { 1711 guarantee(false, "Unknown Continuation native intrinsic"); 1712 } 1713 1714 #ifdef ASSERT 1715 if (method->is_continuation_enter_intrinsic()) { 1716 assert(interpreted_entry_offset != -1, "Must be set"); 1717 assert(exception_offset != -1, "Must be set"); 1718 } else { 1719 assert(interpreted_entry_offset == -1, "Must be unset"); 1720 assert(exception_offset == -1, "Must be unset"); 1721 } 1722 assert(frame_complete != -1, "Must be set"); 1723 assert(stack_slots != -1, "Must be set"); 1724 assert(vep_offset != -1, "Must be set"); 1725 #endif 1726 1727 __ flush(); 1728 nmethod* nm = nmethod::new_native_nmethod(method, 1729 compile_id, 1730 masm->code(), 1731 vep_offset, 1732 frame_complete, 1733 stack_slots, 1734 in_ByteSize(-1), 1735 in_ByteSize(-1), 1736 oop_maps, 1737 exception_offset); 1738 if (method->is_continuation_enter_intrinsic()) { 1739 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1740 } else if (method->is_continuation_yield_intrinsic()) { 1741 _cont_doYield_stub = nm; 1742 } 1743 return nm; 1744 } 1745 1746 if (method->is_method_handle_intrinsic()) { 1747 vmIntrinsics::ID iid = method->intrinsic_id(); 1748 intptr_t start = (intptr_t)__ pc(); 1749 int 
vep_offset = ((intptr_t)__ pc()) - start; 1750 gen_special_dispatch(masm, 1751 method, 1752 in_sig_bt, 1753 in_regs); 1754 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1755 __ flush(); 1756 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1757 return nmethod::new_native_nmethod(method, 1758 compile_id, 1759 masm->code(), 1760 vep_offset, 1761 frame_complete, 1762 stack_slots / VMRegImpl::slots_per_word, 1763 in_ByteSize(-1), 1764 in_ByteSize(-1), 1765 (OopMapSet*)NULL); 1766 } 1767 address native_func = method->native_function(); 1768 assert(native_func != NULL, "must have function"); 1769 1770 // An OopMap for lock (and class if static) 1771 OopMapSet *oop_maps = new OopMapSet(); 1772 intptr_t start = (intptr_t)__ pc(); 1773 1774 // We have received a description of where all the java arg are located 1775 // on entry to the wrapper. We need to convert these args to where 1776 // the jni function will expect them. To figure out where they go 1777 // we convert the java signature to a C signature by inserting 1778 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1779 1780 const int total_in_args = method->size_of_parameters(); 1781 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1782 1783 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1784 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1785 BasicType* in_elem_bt = NULL; 1786 1787 int argc = 0; 1788 out_sig_bt[argc++] = T_ADDRESS; 1789 if (method->is_static()) { 1790 out_sig_bt[argc++] = T_OBJECT; 1791 } 1792 1793 for (int i = 0; i < total_in_args ; i++ ) { 1794 out_sig_bt[argc++] = in_sig_bt[i]; 1795 } 1796 1797 // Now figure out where the args must be stored and how much stack space 1798 // they require. 1799 int out_arg_slots; 1800 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); 1801 1802 // Compute framesize for the wrapper. We need to handlize all oops in 1803 // incoming registers 1804 1805 // Calculate the total number of stack slots we will need. 
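  // Worked example (illustrative only; the concrete numbers are assumptions, not taken from
  // a real compile): for a static synchronized method with out_arg_slots == 8 and
  // out_preserve_stack_slots() == 0, the accounting below gives
  //   8 (outgoing args) + 12 (oop handle area, 6 words) + 2 (klass handle) + 2 (lock)
  //   + 6 (return-value temps, return address and saved rbp) = 30 slots,
  // which align_up() then rounds to 32 slots for 16-byte stack alignment.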
1806 1807 // First count the abi requirement plus all of the outgoing args 1808 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1809 1810 // Now the space for the inbound oop handle area 1811 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1812 1813 int oop_handle_offset = stack_slots; 1814 stack_slots += total_save_slots; 1815 1816 // Now any space we need for handlizing a klass if static method 1817 1818 int klass_slot_offset = 0; 1819 int klass_offset = -1; 1820 int lock_slot_offset = 0; 1821 bool is_static = false; 1822 1823 if (method->is_static()) { 1824 klass_slot_offset = stack_slots; 1825 stack_slots += VMRegImpl::slots_per_word; 1826 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1827 is_static = true; 1828 } 1829 1830 // Plus a lock if needed 1831 1832 if (method->is_synchronized()) { 1833 lock_slot_offset = stack_slots; 1834 stack_slots += VMRegImpl::slots_per_word; 1835 } 1836 1837 // Now a place (+2) to save return values or temp during shuffling 1838 // + 4 for return address (which we own) and saved rbp 1839 stack_slots += 6; 1840 1841 // Ok The space we have allocated will look like: 1842 // 1843 // 1844 // FP-> | | 1845 // |---------------------| 1846 // | 2 slots for moves | 1847 // |---------------------| 1848 // | lock box (if sync) | 1849 // |---------------------| <- lock_slot_offset 1850 // | klass (if static) | 1851 // |---------------------| <- klass_slot_offset 1852 // | oopHandle area | 1853 // |---------------------| <- oop_handle_offset (6 java arg registers) 1854 // | outbound memory | 1855 // | based arguments | 1856 // | | 1857 // |---------------------| 1858 // | | 1859 // SP-> | out_preserved_slots | 1860 // 1861 // 1862 1863 1864 // Now compute actual number of stack words we need rounding to make 1865 // stack properly aligned. 1866 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1867 1868 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1869 1870 // First thing make an ic check to see if we should even be here 1871 1872 // We are free to use all registers as temps without saving them and 1873 // restoring them except rbp. rbp is the only callee save register 1874 // as far as the interpreter and the compiler(s) are concerned. 1875 1876 1877 const Register ic_reg = rax; 1878 const Register receiver = j_rarg0; 1879 1880 Label hit; 1881 Label exception_pending; 1882 1883 assert_different_registers(ic_reg, receiver, rscratch1, rscratch2); 1884 __ verify_oop(receiver); 1885 __ load_klass(rscratch1, receiver, rscratch2); 1886 __ cmpq(ic_reg, rscratch1); 1887 __ jcc(Assembler::equal, hit); 1888 1889 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1890 1891 // Verified entry point must be aligned 1892 __ align(8); 1893 1894 __ bind(hit); 1895 1896 int vep_offset = ((intptr_t)__ pc()) - start; 1897 1898 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1899 Label L_skip_barrier; 1900 Register klass = r10; 1901 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1902 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1903 1904 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1905 1906 __ bind(L_skip_barrier); 1907 } 1908 1909 #ifdef COMPILER1 1910 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
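  // (Illustrative note: when the identity hash is not yet installed in the mark word, the
  //  inlined check simply falls through and the full native wrapper below makes the normal
  //  JNI call.)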
1911 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1912 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1913 } 1914 #endif // COMPILER1 1915 1916 // The instruction at the verified entry point must be 5 bytes or longer 1917 // because it can be patched on the fly by make_non_entrant. The stack bang 1918 // instruction fits that requirement. 1919 1920 // Generate stack overflow check 1921 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1922 1923 // Generate a new frame for the wrapper. 1924 __ enter(); 1925 // -2 because return address is already present and so is saved rbp 1926 __ subptr(rsp, stack_size - 2*wordSize); 1927 1928 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1929 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 1930 bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */); 1931 1932 // Frame is now completed as far as size and linkage. 1933 int frame_complete = ((intptr_t)__ pc()) - start; 1934 1935 if (UseRTMLocking) { 1936 // Abort RTM transaction before calling JNI 1937 // because critical section will be large and will be 1938 // aborted anyway. Also nmethod could be deoptimized. 1939 __ xabort(0); 1940 } 1941 1942 #ifdef ASSERT 1943 __ check_stack_alignment(rsp, "improperly aligned stack"); 1944 #endif /* ASSERT */ 1945 1946 1947 // We use r14 as the oop handle for the receiver/klass 1948 // It is callee save so it survives the call to native 1949 1950 const Register oop_handle_reg = r14; 1951 1952 // 1953 // We immediately shuffle the arguments so that for any vm call we have to 1954 // make from here on out (sync slow path, jvmti, etc.) we will have 1955 // captured the oops from our caller and have a valid oopMap for 1956 // them. 1957 1958 // ----------------- 1959 // The Grand Shuffle 1960 1961 // The Java calling convention is either equal (linux) or denser (win64) than the 1962 // C calling convention. However, because of the jni_env argument, the C calling 1963 // convention always has at least one more (and two for static) arguments than Java. 1964 // Therefore, if we move the args from java -> c backwards, then we will never have 1965 // a register->register conflict and we don't have to build a dependency graph 1966 // and figure out how to break any cycles. 1967 // 1968 1969 // Record esp-based slot for receiver on stack for non-static methods 1970 int receiver_offset = -1; 1971 1972 // This is a trick. We double the stack slots so we can claim 1973 // the oops in the caller's frame. Since we are sure to have 1974 // more args than the caller, doubling is enough to make 1975 // sure we can capture all the incoming oop args from the 1976 // caller. 1977 // 1978 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 1979 1980 // Mark location of rbp (someday) 1981 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 1982 1983 // Use eax, ebx as temporaries during any memory-memory moves we have to do 1984 // All inbound args are referenced based on rbp and all outbound args via rsp.
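  // Illustrative sketch of the ordering (the argument counts are made up for the example):
  // for a non-static method with total_in_args == 3 we have total_c_args == 4, and the move
  // loop below walks java[2] -> c[3], java[1] -> c[2], java[0] -> c[1], back to front;
  // c[0] is reserved for the JNIEnv* that is materialized just before the native call, so
  // no move can overwrite an argument that has not been copied yet.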
1985 1986 1987 #ifdef ASSERT 1988 bool reg_destroyed[Register::number_of_registers]; 1989 bool freg_destroyed[XMMRegister::number_of_registers]; 1990 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 1991 reg_destroyed[r] = false; 1992 } 1993 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 1994 freg_destroyed[f] = false; 1995 } 1996 1997 #endif /* ASSERT */ 1998 1999 // For JNI natives the incoming and outgoing registers are offset upwards. 2000 GrowableArray<int> arg_order(2 * total_in_args); 2001 2002 VMRegPair tmp_vmreg; 2003 tmp_vmreg.set2(rbx->as_VMReg()); 2004 2005 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2006 arg_order.push(i); 2007 arg_order.push(c_arg); 2008 } 2009 2010 int temploc = -1; 2011 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2012 int i = arg_order.at(ai); 2013 int c_arg = arg_order.at(ai + 1); 2014 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2015 #ifdef ASSERT 2016 if (in_regs[i].first()->is_Register()) { 2017 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2018 } else if (in_regs[i].first()->is_XMMRegister()) { 2019 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2020 } 2021 if (out_regs[c_arg].first()->is_Register()) { 2022 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2023 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2024 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2025 } 2026 #endif /* ASSERT */ 2027 switch (in_sig_bt[i]) { 2028 case T_ARRAY: 2029 case T_OBJECT: 2030 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2031 ((i == 0) && (!is_static)), 2032 &receiver_offset); 2033 break; 2034 case T_VOID: 2035 break; 2036 2037 case T_FLOAT: 2038 __ float_move(in_regs[i], out_regs[c_arg]); 2039 break; 2040 2041 case T_DOUBLE: 2042 assert( i + 1 < total_in_args && 2043 in_sig_bt[i + 1] == T_VOID && 2044 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2045 __ double_move(in_regs[i], out_regs[c_arg]); 2046 break; 2047 2048 case T_LONG : 2049 __ long_move(in_regs[i], out_regs[c_arg]); 2050 break; 2051 2052 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2053 2054 default: 2055 __ move32_64(in_regs[i], out_regs[c_arg]); 2056 } 2057 } 2058 2059 int c_arg; 2060 2061 // Pre-load a static method's oop into r14. Used both by locking code and 2062 // the normal JNI call code. 2063 // point c_arg at the first arg that is already loaded in case we 2064 // need to spill before we call out 2065 c_arg = total_c_args - total_in_args; 2066 2067 if (method->is_static()) { 2068 2069 // load oop into a register 2070 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2071 2072 // Now handlize the static class mirror it's known not-null. 2073 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2074 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2075 2076 // Now get the handle 2077 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2078 // store the klass handle as second argument 2079 __ movptr(c_rarg1, oop_handle_reg); 2080 // and protect the arg if we must spill 2081 c_arg--; 2082 } 2083 2084 // Change state to native (we save the return address in the thread, since it might not 2085 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2086 // points into the right code segment. 
It does not have to be the correct return pc. 2087 // We use the same pc/oopMap repeatedly when we call out 2088 2089 intptr_t the_pc = (intptr_t) __ pc(); 2090 oop_maps->add_gc_map(the_pc - start, map); 2091 2092 __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1); 2093 2094 2095 // We have all of the arguments setup at this point. We must not touch any register 2096 // argument registers at this point (what if we save/restore them there are no oop? 2097 2098 { 2099 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2100 // protect the args we've loaded 2101 save_args(masm, total_c_args, c_arg, out_regs); 2102 __ mov_metadata(c_rarg1, method()); 2103 __ call_VM_leaf( 2104 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2105 r15_thread, c_rarg1); 2106 restore_args(masm, total_c_args, c_arg, out_regs); 2107 } 2108 2109 // RedefineClasses() tracing support for obsolete method entry 2110 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2111 // protect the args we've loaded 2112 save_args(masm, total_c_args, c_arg, out_regs); 2113 __ mov_metadata(c_rarg1, method()); 2114 __ call_VM_leaf( 2115 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2116 r15_thread, c_rarg1); 2117 restore_args(masm, total_c_args, c_arg, out_regs); 2118 } 2119 2120 // Lock a synchronized method 2121 2122 // Register definitions used by locking and unlocking 2123 2124 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2125 const Register obj_reg = rbx; // Will contain the oop 2126 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2127 const Register old_hdr = r13; // value of old header at unlock time 2128 2129 Label slow_path_lock; 2130 Label lock_done; 2131 2132 if (method->is_synchronized()) { 2133 Label count_mon; 2134 2135 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2136 2137 // Get the handle (the 2nd argument) 2138 __ mov(oop_handle_reg, c_rarg1); 2139 2140 // Get address of the box 2141 2142 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2143 2144 // Load the oop from the handle 2145 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2146 2147 if (!UseHeavyMonitors) { 2148 if (UseFastLocking) { 2149 // Load object header 2150 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2151 __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2152 } else { 2153 // Load immediate 1 into swap_reg %rax 2154 __ movl(swap_reg, 1); 2155 2156 // Load (object->mark() | 1) into swap_reg %rax 2157 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2158 2159 // Save (object->mark() | 1) into BasicLock's displaced header 2160 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2161 2162 // src -> dest iff dest == rax else rax <- dest 2163 __ lock(); 2164 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2165 __ jcc(Assembler::equal, count_mon); 2166 2167 // Hmm should this move to the slow path code area??? 2168 2169 // Test if the oopMark is an obvious stack pointer, i.e., 2170 // 1) (mark & 3) == 0, and 2171 // 2) rsp <= mark < mark + os::pagesize() 2172 // These 3 tests can be done by evaluating the following 2173 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2174 // assuming both stack pointer and pagesize have their 2175 // least significant 2 bits clear. 
2176 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2177 2178 __ subptr(swap_reg, rsp); 2179 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2180 2181 // Save the test result, for recursive case, the result is zero 2182 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2183 __ jcc(Assembler::notEqual, slow_path_lock); 2184 } 2185 } else { 2186 __ jmp(slow_path_lock); 2187 } 2188 __ bind(count_mon); 2189 __ inc_held_monitor_count(); 2190 2191 // Slow path will re-enter here 2192 __ bind(lock_done); 2193 } 2194 2195 // Finally just about ready to make the JNI call 2196 2197 // get JNIEnv* which is first argument to native 2198 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2199 2200 // Now set thread in native 2201 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2202 2203 __ call(RuntimeAddress(native_func)); 2204 2205 // Verify or restore cpu control state after JNI call 2206 __ restore_cpu_control_state_after_jni(rscratch1); 2207 2208 // Unpack native results. 2209 switch (ret_type) { 2210 case T_BOOLEAN: __ c2bool(rax); break; 2211 case T_CHAR : __ movzwl(rax, rax); break; 2212 case T_BYTE : __ sign_extend_byte (rax); break; 2213 case T_SHORT : __ sign_extend_short(rax); break; 2214 case T_INT : /* nothing to do */ break; 2215 case T_DOUBLE : 2216 case T_FLOAT : 2217 // Result is in xmm0 we'll save as needed 2218 break; 2219 case T_ARRAY: // Really a handle 2220 case T_OBJECT: // Really a handle 2221 break; // can't de-handlize until after safepoint check 2222 case T_VOID: break; 2223 case T_LONG: break; 2224 default : ShouldNotReachHere(); 2225 } 2226 2227 Label after_transition; 2228 2229 // Switch thread to "native transition" state before reading the synchronization state. 2230 // This additional state is necessary because reading and testing the synchronization 2231 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2232 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2233 // VM thread changes sync state to synchronizing and suspends threads for GC. 2234 // Thread A is resumed to finish this native method, but doesn't block here since it 2235 // didn't see any synchronization is progress, and escapes. 2236 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2237 2238 // Force this write out before the read below 2239 if (!UseSystemMemoryBarrier) { 2240 __ membar(Assembler::Membar_mask_bits( 2241 Assembler::LoadLoad | Assembler::LoadStore | 2242 Assembler::StoreLoad | Assembler::StoreStore)); 2243 } 2244 2245 // check for safepoint operation in progress and/or pending suspend requests 2246 { 2247 Label Continue; 2248 Label slow_path; 2249 2250 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2251 2252 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2253 __ jcc(Assembler::equal, Continue); 2254 __ bind(slow_path); 2255 2256 // Don't use call_VM as it will see a possible pending exception and forward it 2257 // and never return here preventing us from clearing _last_native_pc down below. 2258 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2259 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2260 // by hand. 
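    // (Illustrative summary of the hand-made call below: save any native result, remember
    //  rsp in r12, reserve the Windows register-argument shadow area, align rsp to 16 bytes
    //  as the C ABI requires, call check_special_condition_for_native_trans, then restore
    //  rsp, the heap base and the saved result.)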
2261 // 2262 __ vzeroupper(); 2263 save_native_result(masm, ret_type, stack_slots); 2264 __ mov(c_rarg0, r15_thread); 2265 __ mov(r12, rsp); // remember sp 2266 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2267 __ andptr(rsp, -16); // align stack as required by ABI 2268 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2269 __ mov(rsp, r12); // restore sp 2270 __ reinit_heapbase(); 2271 // Restore any method result value 2272 restore_native_result(masm, ret_type, stack_slots); 2273 __ bind(Continue); 2274 } 2275 2276 // change thread state 2277 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2278 __ bind(after_transition); 2279 2280 Label reguard; 2281 Label reguard_done; 2282 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2283 __ jcc(Assembler::equal, reguard); 2284 __ bind(reguard_done); 2285 2286 // native result if any is live 2287 2288 // Unlock 2289 Label slow_path_unlock; 2290 Label unlock_done; 2291 if (method->is_synchronized()) { 2292 2293 Label fast_done; 2294 2295 // Get locked oop from the handle we passed to jni 2296 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2297 2298 if (!UseHeavyMonitors && !UseFastLocking) { 2299 Label not_recur; 2300 // Simple recursive lock? 2301 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2302 __ jcc(Assembler::notEqual, not_recur); 2303 __ dec_held_monitor_count(); 2304 __ jmpb(fast_done); 2305 __ bind(not_recur); 2306 } 2307 2308 // Must save rax if it is live now because cmpxchg must use it 2309 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2310 save_native_result(masm, ret_type, stack_slots); 2311 } 2312 2313 if (!UseHeavyMonitors) { 2314 if (UseFastLocking) { 2315 __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2316 __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place); 2317 __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2318 } else { 2319 // get address of the stack lock 2320 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2321 // get old displaced header 2322 __ movptr(old_hdr, Address(rax, 0)); 2323 2324 // Atomic swap old header if oop still contains the stack lock 2325 __ lock(); 2326 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2327 __ jcc(Assembler::notEqual, slow_path_unlock); 2328 } 2329 __ dec_held_monitor_count(); 2330 } else { 2331 __ jmp(slow_path_unlock); 2332 } 2333 2334 // slow path re-enters here 2335 __ bind(unlock_done); 2336 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2337 restore_native_result(masm, ret_type, stack_slots); 2338 } 2339 2340 __ bind(fast_done); 2341 } 2342 { 2343 SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1); 2344 save_native_result(masm, ret_type, stack_slots); 2345 __ mov_metadata(c_rarg1, method()); 2346 __ call_VM_leaf( 2347 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2348 r15_thread, c_rarg1); 2349 restore_native_result(masm, ret_type, stack_slots); 2350 } 2351 2352 __ reset_last_Java_frame(false); 2353 2354 // Unbox oop result, e.g. JNIHandles::resolve value. 
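  // (Illustrative note: rax holds a jobject handle produced by the native code; now that
  //  the thread is back in _thread_in_Java and the safepoint poll is behind us, it is safe
  //  to turn the handle back into a raw oop.)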
2355 if (is_reference_type(ret_type)) { 2356 __ resolve_jobject(rax /* value */, 2357 r15_thread /* thread */, 2358 rcx /* tmp */); 2359 } 2360 2361 if (CheckJNICalls) { 2362 // clear_pending_jni_exception_check 2363 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2364 } 2365 2366 // reset handle block 2367 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2368 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), NULL_WORD); 2369 2370 // pop our frame 2371 2372 __ leave(); 2373 2374 // Any exception pending? 2375 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2376 __ jcc(Assembler::notEqual, exception_pending); 2377 2378 // Return 2379 2380 __ ret(0); 2381 2382 // Unexpected paths are out of line and go here 2383 2384 // forward the exception 2385 __ bind(exception_pending); 2386 2387 // and forward the exception 2388 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2389 2390 // Slow path locking & unlocking 2391 if (method->is_synchronized()) { 2392 2393 // BEGIN Slow path lock 2394 __ bind(slow_path_lock); 2395 2396 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2397 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2398 2399 // protect the args we've loaded 2400 save_args(masm, total_c_args, c_arg, out_regs); 2401 2402 __ mov(c_rarg0, obj_reg); 2403 __ mov(c_rarg1, lock_reg); 2404 __ mov(c_rarg2, r15_thread); 2405 2406 // Not a leaf but we have last_Java_frame setup as we want 2407 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2408 restore_args(masm, total_c_args, c_arg, out_regs); 2409 2410 #ifdef ASSERT 2411 { Label L; 2412 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2413 __ jcc(Assembler::equal, L); 2414 __ stop("no pending exception allowed on exit from monitorenter"); 2415 __ bind(L); 2416 } 2417 #endif 2418 __ jmp(lock_done); 2419 2420 // END Slow path lock 2421 2422 // BEGIN Slow path unlock 2423 __ bind(slow_path_unlock); 2424 2425 // If we haven't already saved the native result we must save it now as xmm registers 2426 // are still exposed. 
2427 __ vzeroupper(); 2428 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2429 save_native_result(masm, ret_type, stack_slots); 2430 } 2431 2432 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2433 2434 __ mov(c_rarg0, obj_reg); 2435 __ mov(c_rarg2, r15_thread); 2436 __ mov(r12, rsp); // remember sp 2437 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2438 __ andptr(rsp, -16); // align stack as required by ABI 2439 2440 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2441 // NOTE that obj_reg == rbx currently 2442 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2443 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2444 2445 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2446 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2447 __ mov(rsp, r12); // restore sp 2448 __ reinit_heapbase(); 2449 #ifdef ASSERT 2450 { 2451 Label L; 2452 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2453 __ jcc(Assembler::equal, L); 2454 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2455 __ bind(L); 2456 } 2457 #endif /* ASSERT */ 2458 2459 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2460 2461 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2462 restore_native_result(masm, ret_type, stack_slots); 2463 } 2464 __ jmp(unlock_done); 2465 2466 // END Slow path unlock 2467 2468 } // synchronized 2469 2470 // SLOW PATH Reguard the stack if needed 2471 2472 __ bind(reguard); 2473 __ vzeroupper(); 2474 save_native_result(masm, ret_type, stack_slots); 2475 __ mov(r12, rsp); // remember sp 2476 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2477 __ andptr(rsp, -16); // align stack as required by ABI 2478 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2479 __ mov(rsp, r12); // restore sp 2480 __ reinit_heapbase(); 2481 restore_native_result(masm, ret_type, stack_slots); 2482 // and continue 2483 __ jmp(reguard_done); 2484 2485 2486 2487 __ flush(); 2488 2489 nmethod *nm = nmethod::new_native_nmethod(method, 2490 compile_id, 2491 masm->code(), 2492 vep_offset, 2493 frame_complete, 2494 stack_slots / VMRegImpl::slots_per_word, 2495 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2496 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2497 oop_maps); 2498 2499 return nm; 2500 } 2501 2502 // this function returns the adjust size (in number of words) to a c2i adapter 2503 // activation for use during deoptimization 2504 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2505 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2506 } 2507 2508 2509 uint SharedRuntime::out_preserve_stack_slots() { 2510 return 0; 2511 } 2512 2513 2514 // Number of stack slots between incoming argument block and the start of 2515 // a new frame. The PROLOG must add this many slots to the stack. The 2516 // EPILOG must remove this many slots. amd64 needs two slots for 2517 // return address. 
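// (Illustrative breakdown, assuming the usual 4-byte VMReg stack slots: the constant 4
//  below is two 64-bit words' worth of slots left in place by the caller -- the return
//  address named in the comment above plus, presumably, the saved rbp -- and
//  VerifyStackAtCalls reserves one extra word.)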
2518 uint SharedRuntime::in_preserve_stack_slots() { 2519 return 4 + 2 * VerifyStackAtCalls; 2520 } 2521 2522 //------------------------------generate_deopt_blob---------------------------- 2523 void SharedRuntime::generate_deopt_blob() { 2524 // Allocate space for the code 2525 ResourceMark rm; 2526 // Setup code generation tools 2527 int pad = 0; 2528 if (UseAVX > 2) { 2529 pad += 1024; 2530 } 2531 #if INCLUDE_JVMCI 2532 if (EnableJVMCI) { 2533 pad += 512; // Increase the buffer size when compiling for JVMCI 2534 } 2535 #endif 2536 CodeBuffer buffer("deopt_blob", 2560+pad, 1024); 2537 MacroAssembler* masm = new MacroAssembler(&buffer); 2538 int frame_size_in_words; 2539 OopMap* map = NULL; 2540 OopMapSet *oop_maps = new OopMapSet(); 2541 2542 // ------------- 2543 // This code enters when returning to a de-optimized nmethod. A return 2544 // address has been pushed on the stack, and return values are in 2545 // registers. 2546 // If we are doing a normal deopt then we were called from the patched 2547 // nmethod from the point we returned to the nmethod. So the return 2548 // address on the stack is wrong by NativeCall::instruction_size 2549 // We will adjust the value so it looks like we have the original return 2550 // address on the stack (like when we eagerly deoptimized). 2551 // In the case of an exception pending when deoptimizing, we enter 2552 // with a return address on the stack that points after the call we patched 2553 // into the exception handler. We have the following register state from, 2554 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2555 // rax: exception oop 2556 // rbx: exception handler 2557 // rdx: throwing pc 2558 // So in this case we simply jam rdx into the useless return address and 2559 // the stack looks just like we want. 2560 // 2561 // At this point we need to de-opt. We save the argument return 2562 // registers. We call the first C routine, fetch_unroll_info(). This 2563 // routine captures the return values and returns a structure which 2564 // describes the current frame size and the sizes of all replacement frames. 2565 // The current frame is compiled code and may contain many inlined 2566 // functions, each with their own JVM state. We pop the current frame, then 2567 // push all the new frames. Then we call the C routine unpack_frames() to 2568 // populate these frames. Finally unpack_frames() returns us the new target 2569 // address. Notice that callee-save registers are BLOWN here; they have 2570 // already been captured in the vframeArray at the time the return PC was 2571 // patched. 2572 address start = __ pc(); 2573 Label cont; 2574 2575 // Prolog for non exception case! 2576 2577 // Save everything in sight. 2578 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2579 2580 // Normal deoptimization. Save exec mode for unpack_frames. 
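  // (Illustrative overview gathered from the code below: r14 carries the Unpack_* exec mode
  //  across the runtime calls -- Unpack_deopt for this normal entry, Unpack_reexecute for
  //  the reexecute and JVMCI uncommon-trap entries, and Unpack_exception for the exception
  //  entries.)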
2581 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2582 __ jmp(cont); 2583 2584 int reexecute_offset = __ pc() - start; 2585 #if INCLUDE_JVMCI && !defined(COMPILER1) 2586 if (EnableJVMCI && UseJVMCICompiler) { 2587 // JVMCI does not use this kind of deoptimization 2588 __ should_not_reach_here(); 2589 } 2590 #endif 2591 2592 // Reexecute case 2593 // the return address is the pc that describes what bci to re-execute at 2594 2595 // No need to update map as each call to save_live_registers will produce identical oopmap 2596 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2597 2598 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2599 __ jmp(cont); 2600 2601 #if INCLUDE_JVMCI 2602 Label after_fetch_unroll_info_call; 2603 int implicit_exception_uncommon_trap_offset = 0; 2604 int uncommon_trap_offset = 0; 2605 2606 if (EnableJVMCI) { 2607 implicit_exception_uncommon_trap_offset = __ pc() - start; 2608 2609 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2610 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2611 2612 uncommon_trap_offset = __ pc() - start; 2613 2614 // Save everything in sight. 2615 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2616 // fetch_unroll_info needs to call last_java_frame() 2617 __ set_last_Java_frame(noreg, noreg, NULL, rscratch1); 2618 2619 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2620 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2621 2622 __ movl(r14, Deoptimization::Unpack_reexecute); 2623 __ mov(c_rarg0, r15_thread); 2624 __ movl(c_rarg2, r14); // exec mode 2625 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2626 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2627 2628 __ reset_last_Java_frame(false); 2629 2630 __ jmp(after_fetch_unroll_info_call); 2631 } // EnableJVMCI 2632 #endif // INCLUDE_JVMCI 2633 2634 int exception_offset = __ pc() - start; 2635 2636 // Prolog for exception case 2637 2638 // all registers are dead at this entry point, except for rax, and 2639 // rdx which contain the exception oop and exception pc 2640 // respectively. Set them in TLS and fall thru to the 2641 // unpack_with_exception_in_tls entry point. 2642 2643 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2644 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2645 2646 int exception_in_tls_offset = __ pc() - start; 2647 2648 // new implementation because exception oop is now passed in JavaThread 2649 2650 // Prolog for exception case 2651 // All registers must be preserved because they might be used by LinearScan 2652 // Exception oop and throwing PC are passed in JavaThread 2653 // tos: stack at point of call to method that threw the exception (i.e. only 2654 // args are on the stack, no return address) 2655 2656 // make room on stack for the return address 2657 // It will be patched later with the throwing pc. The correct value is not 2658 // available now because loading it from memory would destroy registers. 2659 __ push(0); 2660 2661 // Save everything in sight. 2662 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2663 2664 // Now it is safe to overwrite any register 2665 2666 // Deopt during an exception.
Save exec mode for unpack_frames. 2667 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2668 2669 // load throwing pc from JavaThread and patch it as the return address 2670 // of the current frame. Then clear the field in JavaThread 2671 2672 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2673 __ movptr(Address(rbp, wordSize), rdx); 2674 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2675 2676 #ifdef ASSERT 2677 // verify that there is really an exception oop in JavaThread 2678 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2679 __ verify_oop(rax); 2680 2681 // verify that there is no pending exception 2682 Label no_pending_exception; 2683 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2684 __ testptr(rax, rax); 2685 __ jcc(Assembler::zero, no_pending_exception); 2686 __ stop("must not have pending exception here"); 2687 __ bind(no_pending_exception); 2688 #endif 2689 2690 __ bind(cont); 2691 2692 // Call C code. Need thread and this frame, but NOT official VM entry 2693 // crud. We cannot block on this call, no GC can happen. 2694 // 2695 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2696 2697 // fetch_unroll_info needs to call last_java_frame(). 2698 2699 __ set_last_Java_frame(noreg, noreg, NULL, rscratch1); 2700 #ifdef ASSERT 2701 { Label L; 2702 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2703 __ jcc(Assembler::equal, L); 2704 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2705 __ bind(L); 2706 } 2707 #endif // ASSERT 2708 __ mov(c_rarg0, r15_thread); 2709 __ movl(c_rarg1, r14); // exec_mode 2710 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2711 2712 // Need to have an oopmap that tells fetch_unroll_info where to 2713 // find any register it might need. 2714 oop_maps->add_gc_map(__ pc() - start, map); 2715 2716 __ reset_last_Java_frame(false); 2717 2718 #if INCLUDE_JVMCI 2719 if (EnableJVMCI) { 2720 __ bind(after_fetch_unroll_info_call); 2721 } 2722 #endif 2723 2724 // Load UnrollBlock* into rdi 2725 __ mov(rdi, rax); 2726 2727 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); 2728 Label noException; 2729 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2730 __ jcc(Assembler::notEqual, noException); 2731 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2732 // QQQ this is useless it was NULL above 2733 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2734 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2735 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2736 2737 __ verify_oop(rax); 2738 2739 // Overwrite the result registers with the exception results. 2740 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2741 // I think this is useless 2742 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2743 2744 __ bind(noException); 2745 2746 // Only register save data is on the stack. 2747 // Now restore the result registers. Everything else is either dead 2748 // or captured in the vframeArray. 2749 RegisterSaver::restore_result_registers(masm); 2750 2751 // All of the register save area has been popped of the stack. Only the 2752 // return address remains. 2753 2754 // Pop all the frames we must move/replace. 
2755 // 2756 // Frame picture (youngest to oldest) 2757 // 1: self-frame (no frame link) 2758 // 2: deopting frame (no frame link) 2759 // 3: caller of deopting frame (could be compiled/interpreted). 2760 // 2761 // Note: by leaving the return address of self-frame on the stack 2762 // and using the size of frame 2 to adjust the stack 2763 // when we are done the return to frame 3 will still be on the stack. 2764 2765 // Pop deoptimized frame 2766 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes())); 2767 __ addptr(rsp, rcx); 2768 2769 // rsp should be pointing at the return address to the caller (3) 2770 2771 // Pick up the initial fp we should save 2772 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2773 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2774 2775 #ifdef ASSERT 2776 // Compilers generate code that bang the stack by as much as the 2777 // interpreter would need. So this stack banging should never 2778 // trigger a fault. Verify that it does not on non product builds. 2779 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2780 __ bang_stack_size(rbx, rcx); 2781 #endif 2782 2783 // Load address of array of frame pcs into rcx 2784 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2785 2786 // Trash the old pc 2787 __ addptr(rsp, wordSize); 2788 2789 // Load address of array of frame sizes into rsi 2790 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes())); 2791 2792 // Load counter into rdx 2793 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); 2794 2795 // Now adjust the caller's stack to make up for the extra locals 2796 // but record the original sp so that we can save it in the skeletal interpreter 2797 // frame and the stack walking of interpreter_sender will get the unextended sp 2798 // value and not the "real" sp value. 2799 2800 const Register sender_sp = r8; 2801 2802 __ mov(sender_sp, rsp); 2803 __ movl(rbx, Address(rdi, 2804 Deoptimization::UnrollBlock:: 2805 caller_adjustment_offset_in_bytes())); 2806 __ subptr(rsp, rbx); 2807 2808 // Push interpreter frames in a loop 2809 Label loop; 2810 __ bind(loop); 2811 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2812 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2813 __ pushptr(Address(rcx, 0)); // Save return address 2814 __ enter(); // Save old & set new ebp 2815 __ subptr(rsp, rbx); // Prolog 2816 // This value is corrected by layout_activation_impl 2817 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2818 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2819 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2820 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2821 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2822 __ decrementl(rdx); // Decrement counter 2823 __ jcc(Assembler::notZero, loop); 2824 __ pushptr(Address(rcx, 0)); // Save final return address 2825 2826 // Re-push self-frame 2827 __ enter(); // Save old & set new ebp 2828 2829 // Allocate a full sized register save area. 2830 // Return address and rbp are in place, so we allocate two less words. 
2831 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2832 2833 // Restore frame locals after moving the frame 2834 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2835 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2836 2837 // Call C code. Need thread but NOT official VM entry 2838 // crud. We cannot block on this call, no GC can happen. Call should 2839 // restore return values to their stack-slots with the new SP. 2840 // 2841 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2842 2843 // Use rbp because the frames look interpreted now 2844 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2845 // Don't need the precise return PC here, just precise enough to point into this code blob. 2846 address the_pc = __ pc(); 2847 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2848 2849 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2850 __ mov(c_rarg0, r15_thread); 2851 __ movl(c_rarg1, r14); // second arg: exec_mode 2852 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2853 // Revert SP alignment after call since we're going to do some SP relative addressing below 2854 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2855 2856 // Set an oopmap for the call site 2857 // Use the same PC we used for the last java frame 2858 oop_maps->add_gc_map(the_pc - start, 2859 new OopMap( frame_size_in_words, 0 )); 2860 2861 // Clear fp AND pc 2862 __ reset_last_Java_frame(true); 2863 2864 // Collect return values 2865 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2866 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2867 // I think this is useless (throwing pc?) 2868 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2869 2870 // Pop self-frame. 2871 __ leave(); // Epilog 2872 2873 // Jump to interpreter 2874 __ ret(0); 2875 2876 // Make sure all code is generated 2877 masm->flush(); 2878 2879 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2880 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2881 #if INCLUDE_JVMCI 2882 if (EnableJVMCI) { 2883 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2884 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2885 } 2886 #endif 2887 } 2888 2889 #ifdef COMPILER2 2890 //------------------------------generate_uncommon_trap_blob-------------------- 2891 void SharedRuntime::generate_uncommon_trap_blob() { 2892 // Allocate space for the code 2893 ResourceMark rm; 2894 // Setup code generation tools 2895 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); 2896 MacroAssembler* masm = new MacroAssembler(&buffer); 2897 2898 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 2899 2900 address start = __ pc(); 2901 2902 if (UseRTMLocking) { 2903 // Abort RTM transaction before possible nmethod deoptimization. 2904 __ xabort(0); 2905 } 2906 2907 // Push self-frame. We get here with a return address on the 2908 // stack, so rsp is 8-byte aligned until we allocate our frame. 2909 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog! 2910 2911 // No callee saved registers. 
rbp is assumed implicitly saved 2912 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 2913 2914 // compiler left unloaded_class_index in j_rarg0 move to where the 2915 // runtime expects it. 2916 __ movl(c_rarg1, j_rarg0); 2917 2918 __ set_last_Java_frame(noreg, noreg, NULL, rscratch1); 2919 2920 // Call C code. Need thread but NOT official VM entry 2921 // crud. We cannot block on this call, no GC can happen. Call should 2922 // capture callee-saved registers as well as return values. 2923 // Thread is in rdi already. 2924 // 2925 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); 2926 2927 __ mov(c_rarg0, r15_thread); 2928 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap); 2929 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2930 2931 // Set an oopmap for the call site 2932 OopMapSet* oop_maps = new OopMapSet(); 2933 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); 2934 2935 // location of rbp is known implicitly by the frame sender code 2936 2937 oop_maps->add_gc_map(__ pc() - start, map); 2938 2939 __ reset_last_Java_frame(false); 2940 2941 // Load UnrollBlock* into rdi 2942 __ mov(rdi, rax); 2943 2944 #ifdef ASSERT 2945 { Label L; 2946 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()), 2947 Deoptimization::Unpack_uncommon_trap); 2948 __ jcc(Assembler::equal, L); 2949 __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap"); 2950 __ bind(L); 2951 } 2952 #endif 2953 2954 // Pop all the frames we must move/replace. 2955 // 2956 // Frame picture (youngest to oldest) 2957 // 1: self-frame (no frame link) 2958 // 2: deopting frame (no frame link) 2959 // 3: caller of deopting frame (could be compiled/interpreted). 2960 2961 // Pop self-frame. We have no frame, and must rely only on rax and rsp. 2962 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog! 2963 2964 // Pop deoptimized frame (int) 2965 __ movl(rcx, Address(rdi, 2966 Deoptimization::UnrollBlock:: 2967 size_of_deoptimized_frame_offset_in_bytes())); 2968 __ addptr(rsp, rcx); 2969 2970 // rsp should be pointing at the return address to the caller (3) 2971 2972 // Pick up the initial fp we should save 2973 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2974 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); 2975 2976 #ifdef ASSERT 2977 // Compilers generate code that bang the stack by as much as the 2978 // interpreter would need. So this stack banging should never 2979 // trigger a fault. Verify that it does not on non product builds. 
2980 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); 2981 __ bang_stack_size(rbx, rcx); 2982 #endif 2983 2984 // Load address of array of frame pcs into rcx (address*) 2985 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); 2986 2987 // Trash the return pc 2988 __ addptr(rsp, wordSize); 2989 2990 // Load address of array of frame sizes into rsi (intptr_t*) 2991 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes())); 2992 2993 // Counter 2994 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int) 2995 2996 // Now adjust the caller's stack to make up for the extra locals but 2997 // record the original sp so that we can save it in the skeletal 2998 // interpreter frame and the stack walking of interpreter_sender 2999 // will get the unextended sp value and not the "real" sp value. 3000 3001 const Register sender_sp = r8; 3002 3003 __ mov(sender_sp, rsp); 3004 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int) 3005 __ subptr(rsp, rbx); 3006 3007 // Push interpreter frames in a loop 3008 Label loop; 3009 __ bind(loop); 3010 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3011 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand 3012 __ pushptr(Address(rcx, 0)); // Save return address 3013 __ enter(); // Save old & set new rbp 3014 __ subptr(rsp, rbx); // Prolog 3015 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), 3016 sender_sp); // Make it walkable 3017 // This value is corrected by layout_activation_impl 3018 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3019 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3020 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3021 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3022 __ decrementl(rdx); // Decrement counter 3023 __ jcc(Assembler::notZero, loop); 3024 __ pushptr(Address(rcx, 0)); // Save final return address 3025 3026 // Re-push self-frame 3027 __ enter(); // Save old & set new rbp 3028 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); 3029 // Prolog 3030 3031 // Use rbp because the frames look interpreted now 3032 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3033 // Don't need the precise return PC here, just precise enough to point into this code blob. 3034 address the_pc = __ pc(); 3035 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3036 3037 // Call C code. Need thread but NOT official VM entry 3038 // crud. We cannot block on this call, no GC can happen. Call should 3039 // restore return values to their stack-slots with the new SP. 3040 // Thread is in rdi already. 3041 // 3042 // BasicType unpack_frames(JavaThread* thread, int exec_mode); 3043 3044 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI 3045 __ mov(c_rarg0, r15_thread); 3046 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap); 3047 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3048 3049 // Set an oopmap for the call site 3050 // Use the same PC we used for the last java frame 3051 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3052 3053 // Clear fp AND pc 3054 __ reset_last_Java_frame(true); 3055 3056 // Pop self-frame. 
3057 __ leave(); // Epilog 3058 3059 // Jump to interpreter 3060 __ ret(0); 3061 3062 // Make sure all code is generated 3063 masm->flush(); 3064 3065 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, 3066 SimpleRuntimeFrame::framesize >> 1); 3067 } 3068 #endif // COMPILER2 3069 3070 //------------------------------generate_handler_blob------ 3071 // 3072 // Generate a special Compile2Runtime blob that saves all registers, 3073 // and setup oopmap. 3074 // 3075 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { 3076 assert(StubRoutines::forward_exception_entry() != NULL, 3077 "must be generated before"); 3078 3079 ResourceMark rm; 3080 OopMapSet *oop_maps = new OopMapSet(); 3081 OopMap* map; 3082 3083 // Allocate space for the code. Setup code generation tools. 3084 CodeBuffer buffer("handler_blob", 2048, 1024); 3085 MacroAssembler* masm = new MacroAssembler(&buffer); 3086 3087 address start = __ pc(); 3088 address call_pc = NULL; 3089 int frame_size_in_words; 3090 bool cause_return = (poll_type == POLL_AT_RETURN); 3091 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP); 3092 3093 if (UseRTMLocking) { 3094 // Abort RTM transaction before calling runtime 3095 // because critical section will be large and will be 3096 // aborted anyway. Also nmethod could be deoptimized. 3097 __ xabort(0); 3098 } 3099 3100 // Make room for return address (or push it again) 3101 if (!cause_return) { 3102 __ push(rbx); 3103 } 3104 3105 // Save registers, fpu state, and flags 3106 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3107 3108 // The following is basically a call_VM. However, we need the precise 3109 // address of the call in order to generate an oopmap. Hence, we do all the 3110 // work ourselves. 3111 3112 __ set_last_Java_frame(noreg, noreg, NULL, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 3113 3114 // The return address must always be correct so that frame constructor never 3115 // sees an invalid pc. 3116 3117 if (!cause_return) { 3118 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3119 // Additionally, rbx is a callee saved register and we can look at it later to determine 3120 // if someone changed the return address for us! 3121 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3122 __ movptr(Address(rbp, wordSize), rbx); 3123 } 3124 3125 // Do the call 3126 __ mov(c_rarg0, r15_thread); 3127 __ call(RuntimeAddress(call_ptr)); 3128 3129 // Set an oopmap for the call site. This oopmap will map all 3130 // oop-registers and debug-info registers as callee-saved. This 3131 // will allow deoptimization at this safepoint to find all possible 3132 // debug-info recordings, as well as let GC find all oops. 
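  // The offset recorded below (__ pc() - start) is the return address of the
  // call above; that is the pc the stack walker uses to find this oop map.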
3133 3134 oop_maps->add_gc_map( __ pc() - start, map); 3135 3136 Label noException; 3137 3138 __ reset_last_Java_frame(false); 3139 3140 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3141 __ jcc(Assembler::equal, noException); 3142 3143 // Exception pending 3144 3145 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3146 3147 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3148 3149 // No exception case 3150 __ bind(noException); 3151 3152 Label no_adjust; 3153 #ifdef ASSERT 3154 Label bail; 3155 #endif 3156 if (!cause_return) { 3157 Label no_prefix, not_special; 3158 3159 // If our stashed return pc was modified by the runtime we avoid touching it 3160 __ cmpptr(rbx, Address(rbp, wordSize)); 3161 __ jccb(Assembler::notEqual, no_adjust); 3162 3163 // Skip over the poll instruction. 3164 // See NativeInstruction::is_safepoint_poll() 3165 // Possible encodings: 3166 // 85 00 test %eax,(%rax) 3167 // 85 01 test %eax,(%rcx) 3168 // 85 02 test %eax,(%rdx) 3169 // 85 03 test %eax,(%rbx) 3170 // 85 06 test %eax,(%rsi) 3171 // 85 07 test %eax,(%rdi) 3172 // 3173 // 41 85 00 test %eax,(%r8) 3174 // 41 85 01 test %eax,(%r9) 3175 // 41 85 02 test %eax,(%r10) 3176 // 41 85 03 test %eax,(%r11) 3177 // 41 85 06 test %eax,(%r14) 3178 // 41 85 07 test %eax,(%r15) 3179 // 3180 // 85 04 24 test %eax,(%rsp) 3181 // 41 85 04 24 test %eax,(%r12) 3182 // 85 45 00 test %eax,0x0(%rbp) 3183 // 41 85 45 00 test %eax,0x0(%r13) 3184 3185 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3186 __ jcc(Assembler::notEqual, no_prefix); 3187 __ addptr(rbx, 1); 3188 __ bind(no_prefix); 3189 #ifdef ASSERT 3190 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3191 #endif 3192 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3193 // r12/rsp 0x04 3194 // r13/rbp 0x05 3195 __ movzbq(rcx, Address(rbx, 1)); 3196 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3197 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3198 __ cmpptr(rcx, 1); 3199 __ jcc(Assembler::above, not_special); 3200 __ addptr(rbx, 1); 3201 __ bind(not_special); 3202 #ifdef ASSERT 3203 // Verify the correct encoding of the poll we're about to skip. 3204 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3205 __ jcc(Assembler::notEqual, bail); 3206 // Mask out the modrm bits 3207 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3208 // rax encodes to 0, so if the bits are nonzero it's incorrect 3209 __ jcc(Assembler::notZero, bail); 3210 #endif 3211 // Adjust return pc forward to step over the safepoint poll instruction 3212 __ addptr(rbx, 2); 3213 __ movptr(Address(rbp, wordSize), rbx); 3214 } 3215 3216 __ bind(no_adjust); 3217 // Normal exit, restore registers and exit. 3218 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3219 __ ret(0); 3220 3221 #ifdef ASSERT 3222 __ bind(bail); 3223 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3224 #endif 3225 3226 // Make sure all code is generated 3227 masm->flush(); 3228 3229 // Fill-out other meta info 3230 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3231 } 3232 3233 // 3234 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3235 // 3236 // Generate a stub that calls into vm to find out the proper destination 3237 // of a java call. 
All the argument registers are live at this point 3238 // but since this is generic code we don't know what they are and the caller 3239 // must do any gc of the args. 3240 // 3241 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) { 3242 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before"); 3243 3244 // allocate space for the code 3245 ResourceMark rm; 3246 3247 CodeBuffer buffer(name, 1200, 512); 3248 MacroAssembler* masm = new MacroAssembler(&buffer); 3249 3250 int frame_size_in_words; 3251 3252 OopMapSet *oop_maps = new OopMapSet(); 3253 OopMap* map = NULL; 3254 3255 int start = __ offset(); 3256 3257 // No need to save vector registers since they are caller-saved anyway. 3258 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false); 3259 3260 int frame_complete = __ offset(); 3261 3262 __ set_last_Java_frame(noreg, noreg, NULL, rscratch1); 3263 3264 __ mov(c_rarg0, r15_thread); 3265 3266 __ call(RuntimeAddress(destination)); 3267 3268 3269 // Set an oopmap for the call site. 3270 // We need this not only for callee-saved registers, but also for volatile 3271 // registers that the compiler might be keeping live across a safepoint. 3272 3273 oop_maps->add_gc_map( __ offset() - start, map); 3274 3275 // rax contains the address we are going to jump to assuming no exception got installed 3276 3277 // clear last_Java_sp 3278 __ reset_last_Java_frame(false); 3279 // check for pending exceptions 3280 Label pending; 3281 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3282 __ jcc(Assembler::notEqual, pending); 3283 3284 // get the returned Method* 3285 __ get_vm_result_2(rbx, r15_thread); 3286 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx); 3287 3288 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3289 3290 RegisterSaver::restore_live_registers(masm); 3291 3292 // We are back to the original state on entry and ready to go. 3293 3294 __ jmp(rax); 3295 3296 // Pending exception after the safepoint 3297 3298 __ bind(pending); 3299 3300 RegisterSaver::restore_live_registers(masm); 3301 3302 // exception pending => remove activation and forward to exception handler 3303 3304 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 3305 3306 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3307 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3308 3309 // ------------- 3310 // make sure all code is generated 3311 masm->flush(); 3312 3313 // return the blob 3314 // frame_size_words or bytes?? 3315 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3316 } 3317 3318 //------------------------------Montgomery multiplication------------------------ 3319 // 3320 3321 #ifndef _WINDOWS 3322 3323 // Subtract 0:b from carry:a. Return carry. 
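// A portable sketch of what the implementations below compute, kept here only
// for reference (the build uses the inline-asm or intrinsic versions that
// follow):
//
//   julong borrow = 0;
//   for (long i = 0; i < len; i++) {
//     julong ai = a[i], bi = b[i];
//     a[i] = ai - bi - borrow;
//     borrow = (ai < bi || (ai == bi && borrow)) ? 1 : 0;
//   }
//   return carry - borrow;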
3324 static julong 3325 sub(julong a[], julong b[], julong carry, long len) { 3326 long long i = 0, cnt = len; 3327 julong tmp; 3328 asm volatile("clc; " 3329 "0: ; " 3330 "mov (%[b], %[i], 8), %[tmp]; " 3331 "sbb %[tmp], (%[a], %[i], 8); " 3332 "inc %[i]; dec %[cnt]; " 3333 "jne 0b; " 3334 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3335 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3336 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3337 : "memory"); 3338 return tmp; 3339 } 3340 3341 // Multiply (unsigned) Long A by Long B, accumulating the double- 3342 // length result into the accumulator formed of T0, T1, and T2. 3343 #define MACC(A, B, T0, T1, T2) \ 3344 do { \ 3345 unsigned long hi, lo; \ 3346 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3347 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3348 : "r"(A), "a"(B) : "cc"); \ 3349 } while(0) 3350 3351 // As above, but add twice the double-length result into the 3352 // accumulator. 3353 #define MACC2(A, B, T0, T1, T2) \ 3354 do { \ 3355 unsigned long hi, lo; \ 3356 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3357 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3358 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3359 : "r"(A), "a"(B) : "cc"); \ 3360 } while(0) 3361 3362 #else //_WINDOWS 3363 3364 static julong 3365 sub(julong a[], julong b[], julong carry, long len) { 3366 long i; 3367 julong tmp; 3368 unsigned char c = 1; 3369 for (i = 0; i < len; i++) { 3370 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3371 a[i] = tmp; 3372 } 3373 c = _addcarry_u64(c, carry, ~0, &tmp); 3374 return tmp; 3375 } 3376 3377 // Multiply (unsigned) Long A by Long B, accumulating the double- 3378 // length result into the accumulator formed of T0, T1, and T2. 3379 #define MACC(A, B, T0, T1, T2) \ 3380 do { \ 3381 julong hi, lo; \ 3382 lo = _umul128(A, B, &hi); \ 3383 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3384 c = _addcarry_u64(c, hi, T1, &T1); \ 3385 _addcarry_u64(c, T2, 0, &T2); \ 3386 } while(0) 3387 3388 // As above, but add twice the double-length result into the 3389 // accumulator. 3390 #define MACC2(A, B, T0, T1, T2) \ 3391 do { \ 3392 julong hi, lo; \ 3393 lo = _umul128(A, B, &hi); \ 3394 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3395 c = _addcarry_u64(c, hi, T1, &T1); \ 3396 _addcarry_u64(c, T2, 0, &T2); \ 3397 c = _addcarry_u64(0, lo, T0, &T0); \ 3398 c = _addcarry_u64(c, hi, T1, &T1); \ 3399 _addcarry_u64(c, T2, 0, &T2); \ 3400 } while(0) 3401 3402 #endif //_WINDOWS 3403 3404 // Fast Montgomery multiplication. The derivation of the algorithm is 3405 // in A Cryptographic Library for the Motorola DSP56000, 3406 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
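// The key invariant, checked by the asserts below, is that
// inv * n[0] == -1 (mod 2^64), i.e. inv == -inverse(n[0]) mod 2^64. Each
// outer iteration therefore picks m[i] = t0 * inv so that
//   t0 + m[i] * n[0] == t0 + t0 * inv * n[0] == t0 - t0 == 0 (mod 2^64),
// the low 64 bits of the accumulator cancel, and the accumulator shifts
// right by one word. After len iterations the result equals
// a * b * 2^(-64*len) modulo n, up to multiples of n; the trailing
// "while (t0) t0 = sub(...)" loop subtracts n while a carry word remains.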
3407 3408 static void NOINLINE 3409 montgomery_multiply(julong a[], julong b[], julong n[], 3410 julong m[], julong inv, int len) { 3411 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3412 int i; 3413 3414 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3415 3416 for (i = 0; i < len; i++) { 3417 int j; 3418 for (j = 0; j < i; j++) { 3419 MACC(a[j], b[i-j], t0, t1, t2); 3420 MACC(m[j], n[i-j], t0, t1, t2); 3421 } 3422 MACC(a[i], b[0], t0, t1, t2); 3423 m[i] = t0 * inv; 3424 MACC(m[i], n[0], t0, t1, t2); 3425 3426 assert(t0 == 0, "broken Montgomery multiply"); 3427 3428 t0 = t1; t1 = t2; t2 = 0; 3429 } 3430 3431 for (i = len; i < 2*len; i++) { 3432 int j; 3433 for (j = i-len+1; j < len; j++) { 3434 MACC(a[j], b[i-j], t0, t1, t2); 3435 MACC(m[j], n[i-j], t0, t1, t2); 3436 } 3437 m[i-len] = t0; 3438 t0 = t1; t1 = t2; t2 = 0; 3439 } 3440 3441 while (t0) 3442 t0 = sub(m, n, t0, len); 3443 } 3444 3445 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3446 // multiplies so it should be up to 25% faster than Montgomery 3447 // multiplication. However, its loop control is more complex and it 3448 // may actually run slower on some machines. 3449 3450 static void NOINLINE 3451 montgomery_square(julong a[], julong n[], 3452 julong m[], julong inv, int len) { 3453 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3454 int i; 3455 3456 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3457 3458 for (i = 0; i < len; i++) { 3459 int j; 3460 int end = (i+1)/2; 3461 for (j = 0; j < end; j++) { 3462 MACC2(a[j], a[i-j], t0, t1, t2); 3463 MACC(m[j], n[i-j], t0, t1, t2); 3464 } 3465 if ((i & 1) == 0) { 3466 MACC(a[j], a[j], t0, t1, t2); 3467 } 3468 for (; j < i; j++) { 3469 MACC(m[j], n[i-j], t0, t1, t2); 3470 } 3471 m[i] = t0 * inv; 3472 MACC(m[i], n[0], t0, t1, t2); 3473 3474 assert(t0 == 0, "broken Montgomery square"); 3475 3476 t0 = t1; t1 = t2; t2 = 0; 3477 } 3478 3479 for (i = len; i < 2*len; i++) { 3480 int start = i-len+1; 3481 int end = start + (len - start)/2; 3482 int j; 3483 for (j = start; j < end; j++) { 3484 MACC2(a[j], a[i-j], t0, t1, t2); 3485 MACC(m[j], n[i-j], t0, t1, t2); 3486 } 3487 if ((i & 1) == 0) { 3488 MACC(a[j], a[j], t0, t1, t2); 3489 } 3490 for (; j < len; j++) { 3491 MACC(m[j], n[i-j], t0, t1, t2); 3492 } 3493 m[i-len] = t0; 3494 t0 = t1; t1 = t2; t2 = 0; 3495 } 3496 3497 while (t0) 3498 t0 = sub(m, n, t0, len); 3499 } 3500 3501 // Swap words in a longword. 3502 static julong swap(julong x) { 3503 return (x << 32) | (x >> 32); 3504 } 3505 3506 // Copy len longwords from s to d, word-swapping as we go. The 3507 // destination array is reversed. 3508 static void reverse_words(julong *s, julong *d, int len) { 3509 d += len; 3510 while(len-- > 0) { 3511 d--; 3512 *d = swap(*s); 3513 s++; 3514 } 3515 } 3516 3517 // The threshold at which squaring is advantageous was determined 3518 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 3519 #define MONTGOMERY_SQUARING_THRESHOLD 64 3520 3521 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3522 jint len, jlong inv, 3523 jint *m_ints) { 3524 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3525 int longwords = len/2; 3526 3527 // Make very sure we don't use so much space that the stack might 3528 // overflow. 512 jints corresponds to an 16384-bit integer and 3529 // will use here a total of 8k bytes of stack space. 
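  // (Arithmetic: longwords <= 8192 / (sizeof(julong) * 4) = 256, and the four
  // scratch arrays a, b, n and m then occupy 4 * 256 * 8 = 8192 bytes; 256
  // julongs are 512 jints, i.e. a 16384-bit integer.)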
3530 int divisor = sizeof(julong) * 4; 3531 guarantee(longwords <= 8192 / divisor, "must be"); 3532 int total_allocation = longwords * sizeof (julong) * 4; 3533 julong *scratch = (julong *)alloca(total_allocation); 3534 3535 // Local scratch arrays 3536 julong 3537 *a = scratch + 0 * longwords, 3538 *b = scratch + 1 * longwords, 3539 *n = scratch + 2 * longwords, 3540 *m = scratch + 3 * longwords; 3541 3542 reverse_words((julong *)a_ints, a, longwords); 3543 reverse_words((julong *)b_ints, b, longwords); 3544 reverse_words((julong *)n_ints, n, longwords); 3545 3546 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3547 3548 reverse_words(m, (julong *)m_ints, longwords); 3549 } 3550 3551 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3552 jint len, jlong inv, 3553 jint *m_ints) { 3554 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3555 int longwords = len/2; 3556 3557 // Make very sure we don't use so much space that the stack might 3558 // overflow. 512 jints correspond to a 16384-bit integer and 3559 // will use here a total of 6k bytes of stack space. 3560 int divisor = sizeof(julong) * 3; 3561 guarantee(longwords <= (8192 / divisor), "must be"); 3562 int total_allocation = longwords * sizeof (julong) * 3; 3563 julong *scratch = (julong *)alloca(total_allocation); 3564 3565 // Local scratch arrays 3566 julong 3567 *a = scratch + 0 * longwords, 3568 *n = scratch + 1 * longwords, 3569 *m = scratch + 2 * longwords; 3570 3571 reverse_words((julong *)a_ints, a, longwords); 3572 reverse_words((julong *)n_ints, n, longwords); 3573 3574 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3575 ::montgomery_square(a, n, m, (julong)inv, longwords); 3576 } else { 3577 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3578 } 3579 3580 reverse_words(m, (julong *)m_ints, longwords); 3581 } 3582 3583 #ifdef COMPILER2 3584 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 3585 // 3586 //------------------------------generate_exception_blob--------------------------- 3587 // creates exception blob at the end 3588 // This code is jumped to from a compiled method's exception handler stub. 3589 // (see emit_exception_handler in x86_64.ad file) 3590 // 3591 // Given an exception pc at a call we call into the runtime for the 3592 // handler in this method. This handler might merely restore state 3593 // (i.e. callee-saved registers), unwind the frame, and continue exception 3594 // propagation in the caller if there is no Java-level handler 3595 // in the nmethod. 3596 // 3597 // This code is entered with a jmp. 3598 // 3599 // Arguments: 3600 // rax: exception oop 3601 // rdx: exception pc 3602 // 3603 // Results: 3604 // rax: exception oop 3605 // rdx: exception pc in caller or ??? 3606 // destination: exception handler of caller 3607 // 3608 // Note: the exception pc MUST be at a call (precise debug information) 3609 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
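// Note: SimpleRuntimeFrame::framesize is counted in 32-bit stack slots, so
// the "framesize % 4 == 0" assert below checks 16-byte stack alignment and
// "framesize >> 1" converts slots to 8-byte words when creating the blob.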
3610 // 3611 3612 void OptoRuntime::generate_exception_blob() { 3613 assert(!OptoRuntime::is_callee_saved_register(RDX_num), ""); 3614 assert(!OptoRuntime::is_callee_saved_register(RAX_num), ""); 3615 assert(!OptoRuntime::is_callee_saved_register(RCX_num), ""); 3616 3617 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); 3618 3619 // Allocate space for the code 3620 ResourceMark rm; 3621 // Setup code generation tools 3622 CodeBuffer buffer("exception_blob", 2048, 1024); 3623 MacroAssembler* masm = new MacroAssembler(&buffer); 3624 3625 3626 address start = __ pc(); 3627 3628 // Exception pc is 'return address' for stack walker 3629 __ push(rdx); 3630 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog 3631 3632 // Save callee-saved registers. See x86_64.ad. 3633 3634 // rbp is an implicitly saved callee saved register (i.e., the calling 3635 // convention will save/restore it in the prolog/epilog). Other than that 3636 // there are no callee save registers now that adapter frames are gone. 3637 3638 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp); 3639 3640 // Store exception in Thread object. We cannot pass any arguments to the 3641 // handle_exception call, since we do not want to make any assumption 3642 // about the size of the frame where the exception happened in. 3643 // c_rarg0 is either rdi (Linux) or rcx (Windows). 3644 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax); 3645 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 3646 3647 // This call does all the hard work. It checks if an exception handler 3648 // exists in the method. 3649 // If so, it returns the handler address. 3650 // If not, it prepares for stack-unwinding, restoring the callee-save 3651 // registers of the frame being removed. 3652 // 3653 // address OptoRuntime::handle_exception_C(JavaThread* thread) 3654 3655 // At a method handle call, the stack may not be properly aligned 3656 // when returning with an exception. 3657 address the_pc = __ pc(); 3658 __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1); 3659 __ mov(c_rarg0, r15_thread); 3660 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3661 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C))); 3662 3663 // Set an oopmap for the call site. This oopmap will only be used if we 3664 // are unwinding the stack. Hence, all locations will be dead. 3665 // Callee-saved registers will be the same as the frame above (i.e., 3666 // handle_exception_stub), since they were restored when we got the 3667 // exception. 3668 3669 OopMapSet* oop_maps = new OopMapSet(); 3670 3671 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); 3672 3673 __ reset_last_Java_frame(false); 3674 3675 // Restore callee-saved registers 3676 3677 // rbp is an implicitly saved callee-saved register (i.e., the calling 3678 // convention will save restore it in prolog/epilog) Other than that 3679 // there are no callee save registers now that adapter frames are gone. 3680 3681 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt)); 3682 3683 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog 3684 __ pop(rdx); // No need for exception pc anymore 3685 3686 // rax: exception handler 3687 3688 // We have a handler in rax (could be deopt blob). 
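  // (e.g. if this nmethod was deoptimized while we were in the runtime call,
  // the address we got back is the deopt blob's exception entry rather than a
  // regular catch handler; either way we simply jump there below with the
  // exception oop and pc reloaded from the thread.)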
3689 __ mov(r8, rax); 3690 3691 // Get the exception oop 3692 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3693 // Get the exception pc in case we are deoptimized 3694 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3695 #ifdef ASSERT 3696 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD); 3697 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 3698 #endif 3699 // Clear the exception oop so GC no longer processes it as a root. 3700 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 3701 3702 // rax: exception oop 3703 // r8: exception handler 3704 // rdx: exception pc 3705 // Jump to handler 3706 3707 __ jmp(r8); 3708 3709 // Make sure all code is generated 3710 masm->flush(); 3711 3712 // Set exception blob 3713 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1); 3714 } 3715 #endif // COMPILER2 3716