/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/timerTrace.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
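  // Explanatory note (not generated code): the XSAVE_AREA_* offsets below follow
  // the x86 XSAVE image layout.  Bytes 0..511 form the legacy FXSAVE region
  // (XMM register images start at byte 160), bytes 512..575 hold the XSAVE
  // header, and the extended components (upper YMM halves, APX eGPRs, opmask
  // and ZMM state) sit at the offsets hard-coded here, matching what CPUID
  // reports on current x86-64 processors.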
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.
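  // Explanatory sketch only (the layout enum above is authoritative): the frame
  // built below looks roughly like this, from high to low addresses:
  //   [ return address ]                pushed by the caller
  //   [ saved rbp      ]                enter()
  //   [ saved rflags   ]                pushf()
  //   [ 8-byte pad     ]                keeps rsp 16-byte aligned
  //   [ 16 legacy GPRs ]                save_legacy_gprs()
  //   [ FPU/XSAVE area ]                push_FPU_state()
  //   [ arg_reg_save_area_bytes of outgoing argument space, allocated later ]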
206 207 __ enter(); // rsp becomes 16-byte aligned here 208 __ pushf(); 209 // Make sure rsp stays 16-byte aligned 210 __ subq(rsp, 8); 211 // Push CPU state in multiple of 16 bytes 212 __ save_legacy_gprs(); 213 __ push_FPU_state(); 214 215 216 // push cpu state handles this on EVEX enabled targets 217 if (save_wide_vectors) { 218 // Save upper half of YMM registers(0..15) 219 int base_addr = XSAVE_AREA_YMM_BEGIN; 220 for (int n = 0; n < 16; n++) { 221 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 222 } 223 if (VM_Version::supports_evex()) { 224 // Save upper half of ZMM registers(0..15) 225 base_addr = XSAVE_AREA_ZMM_BEGIN; 226 for (int n = 0; n < 16; n++) { 227 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 228 } 229 // Save full ZMM registers(16..num_xmm_regs) 230 base_addr = XSAVE_AREA_UPPERBANK; 231 off = 0; 232 int vector_len = Assembler::AVX_512bit; 233 for (int n = 16; n < num_xmm_regs; n++) { 234 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 235 } 236 #if COMPILER2_OR_JVMCI 237 base_addr = XSAVE_AREA_OPMASK_BEGIN; 238 off = 0; 239 for(int n = 0; n < KRegister::number_of_registers; n++) { 240 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 241 } 242 #endif 243 } 244 } else { 245 if (VM_Version::supports_evex()) { 246 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 247 int base_addr = XSAVE_AREA_UPPERBANK; 248 off = 0; 249 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 250 for (int n = 16; n < num_xmm_regs; n++) { 251 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 252 } 253 #if COMPILER2_OR_JVMCI 254 base_addr = XSAVE_AREA_OPMASK_BEGIN; 255 off = 0; 256 for(int n = 0; n < KRegister::number_of_registers; n++) { 257 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 258 } 259 #endif 260 } 261 } 262 263 #if COMPILER2_OR_JVMCI 264 if (UseAPX) { 265 int base_addr = XSAVE_AREA_EGPRS; 266 off = 0; 267 for (int n = 16; n < Register::number_of_registers; n++) { 268 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 269 } 270 } 271 #endif 272 273 __ vzeroupper(); 274 if (frame::arg_reg_save_area_bytes != 0) { 275 // Allocate argument register save area 276 __ subptr(rsp, frame::arg_reg_save_area_bytes); 277 } 278 279 // Set an oopmap for the call site. This oopmap will map all 280 // oop-registers and debug-info registers as callee-saved. This 281 // will allow deoptimization at this safepoint to find all possible 282 // debug-info recordings, as well as let GC find all oops. 
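  // Note (explanatory, not generated code): the OopMap built below is keyed by
  // the same 4-byte slot indices as the layout enum above; STACK_OFFSET(off) is
  // simply VMRegImpl::stack2reg(off), i.e. "this register's saved value now
  // lives in stack slot 'off' of the frame we just built".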
283 284 OopMapSet *oop_maps = new OopMapSet(); 285 OopMap* map = new OopMap(frame_size_in_slots, 0); 286 287 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 288 289 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 290 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 293 // rbp location is known implicitly by the frame sender code, needs no oopmap 294 // and the location where rbp was saved by is ignored 295 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 296 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 305 306 if (UseAPX) { 307 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 308 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 323 } 324 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 325 // on EVEX enabled targets, we get it included in the xsave area 326 off = xmm0_off; 327 int delta = xmm1_off - off; 328 for (int n = 0; n < 16; n++) { 329 XMMRegister xmm_name = as_XMMRegister(n); 330 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 331 off += delta; 332 } 333 if (UseAVX > 2) { 334 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 335 off = zmm16_off; 336 delta = zmm17_off - off; 337 for (int n = 16; n < num_xmm_regs; n++) { 338 XMMRegister zmm_name = as_XMMRegister(n); 339 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 340 off += delta; 341 } 342 } 343 344 #if COMPILER2_OR_JVMCI 345 if (save_wide_vectors) { 346 // Save upper half of YMM registers(0..15) 347 off = ymm0_off; 348 delta = ymm1_off - ymm0_off; 349 for (int n = 0; n < 16; n++) { 350 XMMRegister ymm_name = as_XMMRegister(n); 351 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 352 off += delta; 353 } 354 if (VM_Version::supports_evex()) { 355 // Save upper half of ZMM registers(0..15) 356 off = zmm0_off; 
357 delta = zmm1_off - zmm0_off; 358 for (int n = 0; n < 16; n++) { 359 XMMRegister zmm_name = as_XMMRegister(n); 360 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 361 off += delta; 362 } 363 } 364 } 365 #endif // COMPILER2_OR_JVMCI 366 367 // %%% These should all be a waste but we'll keep things as they were for now 368 if (true) { 369 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 370 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 373 // rbp location is known implicitly by the frame sender code, needs no oopmap 374 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 375 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 384 if (UseAPX) { 385 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 386 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 401 } 402 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 403 // on EVEX enabled targets, we get it included in the xsave area 404 off = xmm0H_off; 405 delta = xmm1H_off - off; 406 for (int n = 0; n < 16; n++) { 407 XMMRegister xmm_name = as_XMMRegister(n); 408 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 409 off += delta; 410 } 411 if (UseAVX > 2) { 412 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 413 off = zmm16H_off; 414 delta = zmm17H_off - off; 415 for (int n = 16; n < num_xmm_regs; n++) { 416 XMMRegister zmm_name = as_XMMRegister(n); 417 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 418 off += delta; 419 } 420 } 421 } 422 423 return map; 
424 } 425 426 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 427 int num_xmm_regs = XMMRegister::available_xmm_registers(); 428 if (frame::arg_reg_save_area_bytes != 0) { 429 // Pop arg register save area 430 __ addptr(rsp, frame::arg_reg_save_area_bytes); 431 } 432 433 #if COMPILER2_OR_JVMCI 434 if (restore_wide_vectors) { 435 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 436 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 437 } 438 #else 439 assert(!restore_wide_vectors, "vectors are generated only by C2"); 440 #endif 441 442 __ vzeroupper(); 443 444 // On EVEX enabled targets everything is handled in pop fpu state 445 if (restore_wide_vectors) { 446 // Restore upper half of YMM registers (0..15) 447 int base_addr = XSAVE_AREA_YMM_BEGIN; 448 for (int n = 0; n < 16; n++) { 449 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 450 } 451 if (VM_Version::supports_evex()) { 452 // Restore upper half of ZMM registers (0..15) 453 base_addr = XSAVE_AREA_ZMM_BEGIN; 454 for (int n = 0; n < 16; n++) { 455 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 456 } 457 // Restore full ZMM registers(16..num_xmm_regs) 458 base_addr = XSAVE_AREA_UPPERBANK; 459 int vector_len = Assembler::AVX_512bit; 460 int off = 0; 461 for (int n = 16; n < num_xmm_regs; n++) { 462 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 463 } 464 #if COMPILER2_OR_JVMCI 465 base_addr = XSAVE_AREA_OPMASK_BEGIN; 466 off = 0; 467 for (int n = 0; n < KRegister::number_of_registers; n++) { 468 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 469 } 470 #endif 471 } 472 } else { 473 if (VM_Version::supports_evex()) { 474 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 475 int base_addr = XSAVE_AREA_UPPERBANK; 476 int off = 0; 477 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 478 for (int n = 16; n < num_xmm_regs; n++) { 479 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 480 } 481 #if COMPILER2_OR_JVMCI 482 base_addr = XSAVE_AREA_OPMASK_BEGIN; 483 off = 0; 484 for (int n = 0; n < KRegister::number_of_registers; n++) { 485 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 486 } 487 #endif 488 } 489 } 490 491 #if COMPILER2_OR_JVMCI 492 if (UseAPX) { 493 int base_addr = XSAVE_AREA_EGPRS; 494 int off = 0; 495 for (int n = 16; n < Register::number_of_registers; n++) { 496 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 497 } 498 } 499 #endif 500 501 // Recover CPU state 502 __ pop_FPU_state(); 503 __ restore_legacy_gprs(); 504 __ addq(rsp, 8); 505 __ popf(); 506 // Get the rbp described implicitly by the calling convention (no oopMap) 507 __ pop(rbp); 508 } 509 510 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 511 512 // Just restore result register. Only used by deoptimization. By 513 // now any callee save register that needs to be restored to a c2 514 // caller of the deoptee has been extracted into the vframeArray 515 // and will be stuffed into the c2i adapter we create for later 516 // restoration so only result registers need to be restored here. 
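  // (Concretely that means rax/rdx for the integer result and xmm0 for a
  // float/double result, which is exactly what the loads below reload.)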

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
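  //
  // Illustrative example (not part of the generated code): for a signature
  // like (int, long, float, double, Object) the loop below produces
  //   int    -> j_rarg0
  //   long   -> j_rarg1   (its trailing T_VOID half gets set_bad())
  //   float  -> j_farg0
  //   double -> j_farg1   (its trailing T_VOID half gets set_bad())
  //   Object -> j_rarg2
  // with stk_args == 0; once the six j_rargs are exhausted, further integer
  // arguments fall to stack slots starting at VMRegImpl::stack2reg(0).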
561 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 562 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 563 }; 564 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 565 j_farg0, j_farg1, j_farg2, j_farg3, 566 j_farg4, j_farg5, j_farg6, j_farg7 567 }; 568 569 570 uint int_args = 0; 571 uint fp_args = 0; 572 uint stk_args = 0; 573 574 for (int i = 0; i < total_args_passed; i++) { 575 switch (sig_bt[i]) { 576 case T_BOOLEAN: 577 case T_CHAR: 578 case T_BYTE: 579 case T_SHORT: 580 case T_INT: 581 if (int_args < Argument::n_int_register_parameters_j) { 582 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 583 } else { 584 stk_args = align_up(stk_args, 2); 585 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 586 stk_args += 1; 587 } 588 break; 589 case T_VOID: 590 // halves of T_LONG or T_DOUBLE 591 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 592 regs[i].set_bad(); 593 break; 594 case T_LONG: 595 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 596 // fall through 597 case T_OBJECT: 598 case T_ARRAY: 599 case T_ADDRESS: 600 if (int_args < Argument::n_int_register_parameters_j) { 601 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 602 } else { 603 stk_args = align_up(stk_args, 2); 604 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 605 stk_args += 2; 606 } 607 break; 608 case T_FLOAT: 609 if (fp_args < Argument::n_float_register_parameters_j) { 610 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 611 } else { 612 stk_args = align_up(stk_args, 2); 613 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 614 stk_args += 1; 615 } 616 break; 617 case T_DOUBLE: 618 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 619 if (fp_args < Argument::n_float_register_parameters_j) { 620 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 621 } else { 622 stk_args = align_up(stk_args, 2); 623 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 624 stk_args += 2; 625 } 626 break; 627 default: 628 ShouldNotReachHere(); 629 break; 630 } 631 } 632 633 return stk_args; 634 } 635 636 // Patch the callers callsite with entry to compiled code if it exists. 637 static void patch_callers_callsite(MacroAssembler *masm) { 638 Label L; 639 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 640 __ jcc(Assembler::equal, L); 641 642 // Save the current stack pointer 643 __ mov(r13, rsp); 644 // Schedule the branch target address early. 
645 // Call into the VM to patch the caller, then jump to compiled callee 646 // rax isn't live so capture return address while we easily can 647 __ movptr(rax, Address(rsp, 0)); 648 649 // align stack so push_CPU_state doesn't fault 650 __ andptr(rsp, -(StackAlignmentInBytes)); 651 __ push_CPU_state(); 652 __ vzeroupper(); 653 // VM needs caller's callsite 654 // VM needs target method 655 // This needs to be a long call since we will relocate this adapter to 656 // the codeBuffer and it may not reach 657 658 // Allocate argument register save area 659 if (frame::arg_reg_save_area_bytes != 0) { 660 __ subptr(rsp, frame::arg_reg_save_area_bytes); 661 } 662 __ mov(c_rarg0, rbx); 663 __ mov(c_rarg1, rax); 664 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 665 666 // De-allocate argument register save area 667 if (frame::arg_reg_save_area_bytes != 0) { 668 __ addptr(rsp, frame::arg_reg_save_area_bytes); 669 } 670 671 __ vzeroupper(); 672 __ pop_CPU_state(); 673 // restore sp 674 __ mov(rsp, r13); 675 __ bind(L); 676 } 677 678 static void gen_c2i_adapter(MacroAssembler *masm, 679 int total_args_passed, 680 int comp_args_on_stack, 681 const BasicType *sig_bt, 682 const VMRegPair *regs, 683 Label& skip_fixup) { 684 // Before we get into the guts of the C2I adapter, see if we should be here 685 // at all. We've come from compiled code and are attempting to jump to the 686 // interpreter, which means the caller made a static call to get here 687 // (vcalls always get a compiled target if there is one). Check for a 688 // compiled target. If there is one, we need to patch the caller's call. 689 patch_callers_callsite(masm); 690 691 __ bind(skip_fixup); 692 693 // Since all args are passed on the stack, total_args_passed * 694 // Interpreter::stackElementSize is the space we need. 695 696 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 697 698 int extraspace = (total_args_passed * Interpreter::stackElementSize); 699 700 // stack is aligned, keep it that way 701 // This is not currently needed or enforced by the interpreter, but 702 // we might as well conform to the ABI. 703 extraspace = align_up(extraspace, 2*wordSize); 704 705 // set senderSP value 706 __ lea(r13, Address(rsp, wordSize)); 707 708 #ifdef ASSERT 709 __ check_stack_alignment(r13, "sender stack not aligned"); 710 #endif 711 if (extraspace > 0) { 712 // Pop the return address 713 __ pop(rax); 714 715 __ subptr(rsp, extraspace); 716 717 // Push the return address 718 __ push(rax); 719 720 // Account for the return address location since we store it first rather 721 // than hold it in a register across all the shuffling 722 extraspace += wordSize; 723 } 724 725 #ifdef ASSERT 726 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 727 #endif 728 729 // Now write the args into the outgoing interpreter space 730 for (int i = 0; i < total_args_passed; i++) { 731 if (sig_bt[i] == T_VOID) { 732 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 733 continue; 734 } 735 736 // offset to start parameters 737 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 738 int next_off = st_off - Interpreter::stackElementSize; 739 740 // Say 4 args: 741 // i st_off 742 // 0 32 T_LONG 743 // 1 24 T_VOID 744 // 2 16 T_OBJECT 745 // 3 8 T_BOOL 746 // - 0 return address 747 // 748 // However to make thing extra confusing. 
Because we can fit a long/double in 749 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 750 // leaves one slot empty and only stores to a single slot. In this case the 751 // slot that is occupied is the T_VOID slot. See I said it was confusing. 752 753 VMReg r_1 = regs[i].first(); 754 VMReg r_2 = regs[i].second(); 755 if (!r_1->is_valid()) { 756 assert(!r_2->is_valid(), ""); 757 continue; 758 } 759 if (r_1->is_stack()) { 760 // memory to memory use rax 761 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 762 if (!r_2->is_valid()) { 763 // sign extend?? 764 __ movl(rax, Address(rsp, ld_off)); 765 __ movptr(Address(rsp, st_off), rax); 766 767 } else { 768 769 __ movq(rax, Address(rsp, ld_off)); 770 771 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 772 // T_DOUBLE and T_LONG use two slots in the interpreter 773 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 774 // ld_off == LSW, ld_off+wordSize == MSW 775 // st_off == MSW, next_off == LSW 776 __ movq(Address(rsp, next_off), rax); 777 #ifdef ASSERT 778 // Overwrite the unused slot with known junk 779 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 780 __ movptr(Address(rsp, st_off), rax); 781 #endif /* ASSERT */ 782 } else { 783 __ movq(Address(rsp, st_off), rax); 784 } 785 } 786 } else if (r_1->is_Register()) { 787 Register r = r_1->as_Register(); 788 if (!r_2->is_valid()) { 789 // must be only an int (or less ) so move only 32bits to slot 790 // why not sign extend?? 791 __ movl(Address(rsp, st_off), r); 792 } else { 793 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 794 // T_DOUBLE and T_LONG use two slots in the interpreter 795 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 796 // long/double in gpr 797 #ifdef ASSERT 798 // Overwrite the unused slot with known junk 799 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 800 __ movptr(Address(rsp, st_off), rax); 801 #endif /* ASSERT */ 802 __ movq(Address(rsp, next_off), r); 803 } else { 804 __ movptr(Address(rsp, st_off), r); 805 } 806 } 807 } else { 808 assert(r_1->is_XMMRegister(), ""); 809 if (!r_2->is_valid()) { 810 // only a float use just part of the slot 811 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 812 } else { 813 #ifdef ASSERT 814 // Overwrite the unused slot with known junk 815 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 816 __ movptr(Address(rsp, st_off), rax); 817 #endif /* ASSERT */ 818 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 819 } 820 } 821 } 822 823 // Schedule the branch target address early. 824 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 825 __ jmp(rcx); 826 } 827 828 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 829 int total_args_passed, 830 int comp_args_on_stack, 831 const BasicType *sig_bt, 832 const VMRegPair *regs) { 833 834 // Note: r13 contains the senderSP on entry. We must preserve it since 835 // we may do a i2c -> c2i transition if we lose a race where compiled 836 // code goes non-entrant while we get args ready. 837 // In addition we use r13 to locate all the interpreter args as 838 // we must align the stack to 16 bytes on an i2c entry else we 839 // lose alignment we expect in all compiled code and register 840 // save code can segv when fxsave instructions find improperly 841 // aligned stack pointer. 842 843 // Adapters can be frameless because they do not require the caller 844 // to perform additional cleanup work, such as correcting the stack pointer. 
845 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 846 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 847 // even if a callee has modified the stack pointer. 848 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 849 // routinely repairs its caller's stack pointer (from sender_sp, which is set 850 // up via the senderSP register). 851 // In other words, if *either* the caller or callee is interpreted, we can 852 // get the stack pointer repaired after a call. 853 // This is why c2i and i2c adapters cannot be indefinitely composed. 854 // In particular, if a c2i adapter were to somehow call an i2c adapter, 855 // both caller and callee would be compiled methods, and neither would 856 // clean up the stack pointer changes performed by the two adapters. 857 // If this happens, control eventually transfers back to the compiled 858 // caller, but with an uncorrected stack, causing delayed havoc. 859 860 // Must preserve original SP for loading incoming arguments because 861 // we need to align the outgoing SP for compiled code. 862 __ movptr(r11, rsp); 863 864 // Pick up the return address 865 __ pop(rax); 866 867 // Convert 4-byte c2 stack slots to words. 868 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 869 870 if (comp_args_on_stack) { 871 __ subptr(rsp, comp_words_on_stack * wordSize); 872 } 873 874 // Ensure compiled code always sees stack at proper alignment 875 __ andptr(rsp, -16); 876 877 // push the return address and misalign the stack that youngest frame always sees 878 // as far as the placement of the call instruction 879 __ push(rax); 880 881 // Put saved SP in another register 882 const Register saved_sp = rax; 883 __ movptr(saved_sp, r11); 884 885 // Will jump to the compiled code just as if compiled code was doing it. 886 // Pre-load the register-jump target early, to schedule it better. 887 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 888 889 #if INCLUDE_JVMCI 890 if (EnableJVMCI) { 891 // check if this call should be routed towards a specific entry point 892 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 893 Label no_alternative_target; 894 __ jcc(Assembler::equal, no_alternative_target); 895 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 896 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 897 __ bind(no_alternative_target); 898 } 899 #endif // INCLUDE_JVMCI 900 901 // Now generate the shuffle code. Pick up all register args and move the 902 // rest through the floating point stack top. 903 for (int i = 0; i < total_args_passed; i++) { 904 if (sig_bt[i] == T_VOID) { 905 // Longs and doubles are passed in native word order, but misaligned 906 // in the 32-bit build. 907 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 908 continue; 909 } 910 911 // Pick up 0, 1 or 2 words from SP+offset. 912 913 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 914 "scrambled load targets?"); 915 // Load in argument order going down. 916 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 917 // Point to interpreter value (vs. 
tag) 918 int next_off = ld_off - Interpreter::stackElementSize; 919 // 920 // 921 // 922 VMReg r_1 = regs[i].first(); 923 VMReg r_2 = regs[i].second(); 924 if (!r_1->is_valid()) { 925 assert(!r_2->is_valid(), ""); 926 continue; 927 } 928 if (r_1->is_stack()) { 929 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 930 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 931 932 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 933 // and if we end up going thru a c2i because of a miss a reasonable value of r13 934 // will be generated. 935 if (!r_2->is_valid()) { 936 // sign extend??? 937 __ movl(r13, Address(saved_sp, ld_off)); 938 __ movptr(Address(rsp, st_off), r13); 939 } else { 940 // 941 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 942 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 943 // So we must adjust where to pick up the data to match the interpreter. 944 // 945 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 946 // are accessed as negative so LSW is at LOW address 947 948 // ld_off is MSW so get LSW 949 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 950 next_off : ld_off; 951 __ movq(r13, Address(saved_sp, offset)); 952 // st_off is LSW (i.e. reg.first()) 953 __ movq(Address(rsp, st_off), r13); 954 } 955 } else if (r_1->is_Register()) { // Register argument 956 Register r = r_1->as_Register(); 957 assert(r != rax, "must be different"); 958 if (r_2->is_valid()) { 959 // 960 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 961 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 962 // So we must adjust where to pick up the data to match the interpreter. 963 964 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 965 next_off : ld_off; 966 967 // this can be a misaligned move 968 __ movq(r, Address(saved_sp, offset)); 969 } else { 970 // sign extend and use a full word? 971 __ movl(r, Address(saved_sp, ld_off)); 972 } 973 } else { 974 if (!r_2->is_valid()) { 975 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 976 } else { 977 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 978 } 979 } 980 } 981 982 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 983 984 // 6243940 We might end up in handle_wrong_method if 985 // the callee is deoptimized as we race thru here. If that 986 // happens we don't want to take a safepoint because the 987 // caller frame will look interpreted and arguments are now 988 // "compiled" so it is much better to make this transition 989 // invisible to the stack walking code. Unfortunately if 990 // we try and find the callee by normal means a safepoint 991 // is possible. So we stash the desired callee in the thread 992 // and the vm will find there should this case occur. 
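  // (Explanatory note: the stashed Method* is picked up again by the runtime's
  // handle_wrong_method path, which reads it back from the thread rather than
  // re-resolving the call at a safepoint.)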
993 994 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 995 996 // put Method* where a c2i would expect should we end up there 997 // only needed because eof c2 resolve stubs return Method* as a result in 998 // rax 999 __ mov(rax, rbx); 1000 __ jmp(r11); 1001 } 1002 1003 // --------------------------------------------------------------- 1004 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 1005 int total_args_passed, 1006 int comp_args_on_stack, 1007 const BasicType *sig_bt, 1008 const VMRegPair *regs, 1009 AdapterHandlerEntry* handler) { 1010 address i2c_entry = __ pc(); 1011 1012 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 1013 1014 // ------------------------------------------------------------------------- 1015 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1016 // to the interpreter. The args start out packed in the compiled layout. They 1017 // need to be unpacked into the interpreter layout. This will almost always 1018 // require some stack space. We grow the current (compiled) stack, then repack 1019 // the args. We finally end in a jump to the generic interpreter entry point. 1020 // On exit from the interpreter, the interpreter will restore our SP (lest the 1021 // compiled code, which relies solely on SP and not RBP, get sick). 1022 1023 address c2i_unverified_entry = __ pc(); 1024 Label skip_fixup; 1025 1026 Register data = rax; 1027 Register receiver = j_rarg0; 1028 Register temp = rbx; 1029 1030 { 1031 __ ic_check(1 /* end_alignment */); 1032 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1033 // Method might have been compiled since the call site was patched to 1034 // interpreted if that is the case treat it as a miss so we can get 1035 // the call site corrected. 1036 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1037 __ jcc(Assembler::equal, skip_fixup); 1038 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1039 } 1040 1041 address c2i_entry = __ pc(); 1042 1043 // Class initialization barrier for static methods 1044 address c2i_no_clinit_check_entry = nullptr; 1045 if (VM_Version::supports_fast_class_init_checks()) { 1046 Label L_skip_barrier; 1047 Register method = rbx; 1048 1049 { // Bypass the barrier for non-static methods 1050 Register flags = rscratch1; 1051 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset())); 1052 __ testl(flags, JVM_ACC_STATIC); 1053 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1054 } 1055 1056 Register klass = rscratch1; 1057 __ load_method_holder(klass, method); 1058 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 1059 1060 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1061 1062 __ bind(L_skip_barrier); 1063 c2i_no_clinit_check_entry = __ pc(); 1064 } 1065 1066 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1067 bs->c2i_entry_barrier(masm); 1068 1069 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1070 1071 handler->set_entry_points(i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1072 return; 1073 } 1074 1075 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1076 VMRegPair *regs, 1077 int total_args_passed) { 1078 1079 // We return the amount of VMRegImpl stack slots we need to reserve for all 1080 // the arguments NOT counting out_preserve_stack_slots. 
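  // Illustrative example (not part of the generated code): for C arguments
  // (int, double, void*) the switch below yields
  //   - SysV (Linux and similar): c_rarg0, c_farg0, c_rarg1 with stk_args == 0,
  //     since integer and FP registers are counted independently;
  //   - Win64: c_rarg0, c_farg1, c_rarg2 with stk_args == 6 (raised to 8 at the
  //     end), because each register argument consumes one positional slot from
  //     both register files plus 8 bytes of home space on the stack.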
1081 1082 // NOTE: These arrays will have to change when c1 is ported 1083 #ifdef _WIN64 1084 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1085 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1086 }; 1087 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1088 c_farg0, c_farg1, c_farg2, c_farg3 1089 }; 1090 #else 1091 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1092 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1093 }; 1094 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1095 c_farg0, c_farg1, c_farg2, c_farg3, 1096 c_farg4, c_farg5, c_farg6, c_farg7 1097 }; 1098 #endif // _WIN64 1099 1100 1101 uint int_args = 0; 1102 uint fp_args = 0; 1103 uint stk_args = 0; // inc by 2 each time 1104 1105 for (int i = 0; i < total_args_passed; i++) { 1106 switch (sig_bt[i]) { 1107 case T_BOOLEAN: 1108 case T_CHAR: 1109 case T_BYTE: 1110 case T_SHORT: 1111 case T_INT: 1112 if (int_args < Argument::n_int_register_parameters_c) { 1113 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1114 #ifdef _WIN64 1115 fp_args++; 1116 // Allocate slots for callee to stuff register args the stack. 1117 stk_args += 2; 1118 #endif 1119 } else { 1120 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1121 stk_args += 2; 1122 } 1123 break; 1124 case T_LONG: 1125 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1126 // fall through 1127 case T_OBJECT: 1128 case T_ARRAY: 1129 case T_ADDRESS: 1130 case T_METADATA: 1131 if (int_args < Argument::n_int_register_parameters_c) { 1132 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1133 #ifdef _WIN64 1134 fp_args++; 1135 stk_args += 2; 1136 #endif 1137 } else { 1138 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1139 stk_args += 2; 1140 } 1141 break; 1142 case T_FLOAT: 1143 if (fp_args < Argument::n_float_register_parameters_c) { 1144 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1145 #ifdef _WIN64 1146 int_args++; 1147 // Allocate slots for callee to stuff register args the stack. 1148 stk_args += 2; 1149 #endif 1150 } else { 1151 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1152 stk_args += 2; 1153 } 1154 break; 1155 case T_DOUBLE: 1156 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1157 if (fp_args < Argument::n_float_register_parameters_c) { 1158 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1159 #ifdef _WIN64 1160 int_args++; 1161 // Allocate slots for callee to stuff register args the stack. 1162 stk_args += 2; 1163 #endif 1164 } else { 1165 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1166 stk_args += 2; 1167 } 1168 break; 1169 case T_VOID: // Halves of longs and doubles 1170 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1171 regs[i].set_bad(); 1172 break; 1173 default: 1174 ShouldNotReachHere(); 1175 break; 1176 } 1177 } 1178 #ifdef _WIN64 1179 // windows abi requires that we always allocate enough stack space 1180 // for 4 64bit registers to be stored down. 
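  // That home ("shadow") space is 4 registers x 8 bytes = 32 bytes, i.e. 8 of
  // the 4-byte VMRegImpl slots counted here -- hence the minimum enforced below.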
1181 if (stk_args < 8) { 1182 stk_args = 8; 1183 } 1184 #endif // _WIN64 1185 1186 return stk_args; 1187 } 1188 1189 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1190 uint num_bits, 1191 uint total_args_passed) { 1192 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1193 "only certain vector sizes are supported for now"); 1194 1195 static const XMMRegister VEC_ArgReg[32] = { 1196 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1197 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1198 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1199 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1200 }; 1201 1202 uint stk_args = 0; 1203 uint fp_args = 0; 1204 1205 for (uint i = 0; i < total_args_passed; i++) { 1206 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1207 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1208 regs[i].set_pair(vmreg->next(next_val), vmreg); 1209 } 1210 1211 return stk_args; 1212 } 1213 1214 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1215 // We always ignore the frame_slots arg and just use the space just below frame pointer 1216 // which by this time is free to use 1217 switch (ret_type) { 1218 case T_FLOAT: 1219 __ movflt(Address(rbp, -wordSize), xmm0); 1220 break; 1221 case T_DOUBLE: 1222 __ movdbl(Address(rbp, -wordSize), xmm0); 1223 break; 1224 case T_VOID: break; 1225 default: { 1226 __ movptr(Address(rbp, -wordSize), rax); 1227 } 1228 } 1229 } 1230 1231 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1232 // We always ignore the frame_slots arg and just use the space just below frame pointer 1233 // which by this time is free to use 1234 switch (ret_type) { 1235 case T_FLOAT: 1236 __ movflt(xmm0, Address(rbp, -wordSize)); 1237 break; 1238 case T_DOUBLE: 1239 __ movdbl(xmm0, Address(rbp, -wordSize)); 1240 break; 1241 case T_VOID: break; 1242 default: { 1243 __ movptr(rax, Address(rbp, -wordSize)); 1244 } 1245 } 1246 } 1247 1248 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1249 for ( int i = first_arg ; i < arg_count ; i++ ) { 1250 if (args[i].first()->is_Register()) { 1251 __ push(args[i].first()->as_Register()); 1252 } else if (args[i].first()->is_XMMRegister()) { 1253 __ subptr(rsp, 2*wordSize); 1254 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1255 } 1256 } 1257 } 1258 1259 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1260 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1261 if (args[i].first()->is_Register()) { 1262 __ pop(args[i].first()->as_Register()); 1263 } else if (args[i].first()->is_XMMRegister()) { 1264 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1265 __ addptr(rsp, 2*wordSize); 1266 } 1267 } 1268 } 1269 1270 static void verify_oop_args(MacroAssembler* masm, 1271 const methodHandle& method, 1272 const BasicType* sig_bt, 1273 const VMRegPair* regs) { 1274 Register temp_reg = rbx; // not part of any compiled calling seq 1275 if (VerifyOops) { 1276 for (int i = 0; i < method->size_of_parameters(); i++) { 1277 if (is_reference_type(sig_bt[i])) { 1278 VMReg r = regs[i].first(); 1279 assert(r->is_valid(), "bad oop arg"); 1280 if (r->is_stack()) { 1281 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1282 __ verify_oop(temp_reg); 1283 } else { 1284 __ 
verify_oop(r->as_Register()); 1285 } 1286 } 1287 } 1288 } 1289 } 1290 1291 static void check_continuation_enter_argument(VMReg actual_vmreg, 1292 Register expected_reg, 1293 const char* name) { 1294 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1295 assert(actual_vmreg->as_Register() == expected_reg, 1296 "%s is in unexpected register: %s instead of %s", 1297 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1298 } 1299 1300 1301 //---------------------------- continuation_enter_setup --------------------------- 1302 // 1303 // Arguments: 1304 // None. 1305 // 1306 // Results: 1307 // rsp: pointer to blank ContinuationEntry 1308 // 1309 // Kills: 1310 // rax 1311 // 1312 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1313 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1314 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1315 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1316 1317 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1318 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1319 1320 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1321 OopMap* map = new OopMap(frame_size, 0); 1322 1323 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1324 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1325 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1326 1327 return map; 1328 } 1329 1330 //---------------------------- fill_continuation_entry --------------------------- 1331 // 1332 // Arguments: 1333 // rsp: pointer to blank Continuation entry 1334 // reg_cont_obj: pointer to the continuation 1335 // reg_flags: flags 1336 // 1337 // Results: 1338 // rsp: pointer to filled out ContinuationEntry 1339 // 1340 // Kills: 1341 // rax 1342 // 1343 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1344 assert_different_registers(rax, reg_cont_obj, reg_flags); 1345 #ifdef ASSERT 1346 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1347 #endif 1348 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1349 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1350 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1351 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1352 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1353 1354 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1355 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1356 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1357 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1358 1359 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1360 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1361 } 1362 1363 //---------------------------- continuation_enter_cleanup --------------------------- 1364 // 1365 // Arguments: 1366 // rsp: pointer to the ContinuationEntry 1367 // 1368 // Results: 1369 // rsp: pointer to the spilled rbp in the entry frame 1370 // 1371 // Kills: 1372 // rbx 1373 // 1374 static void continuation_enter_cleanup(MacroAssembler* masm) { 1375 #ifdef ASSERT 1376 
Label L_good_sp; 1377 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1378 __ jcc(Assembler::equal, L_good_sp); 1379 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1380 __ bind(L_good_sp); 1381 #endif 1382 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1383 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1384 1385 if (CheckJNICalls) { 1386 // Check if this is a virtual thread continuation 1387 Label L_skip_vthread_code; 1388 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1389 __ jcc(Assembler::equal, L_skip_vthread_code); 1390 1391 // If the held monitor count is > 0 and this vthread is terminating then 1392 // it failed to release a JNI monitor. So we issue the same log message 1393 // that JavaThread::exit does. 1394 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1395 __ jcc(Assembler::equal, L_skip_vthread_code); 1396 1397 // rax may hold an exception oop, save it before the call 1398 __ push(rax); 1399 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1400 __ pop(rax); 1401 1402 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1403 // on termination. The held count is implicitly zeroed below when we restore from 1404 // the parent held count (which has to be zero). 1405 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1406 1407 __ bind(L_skip_vthread_code); 1408 } 1409 #ifdef ASSERT 1410 else { 1411 // Check if this is a virtual thread continuation 1412 Label L_skip_vthread_code; 1413 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1414 __ jcc(Assembler::equal, L_skip_vthread_code); 1415 1416 // See comment just above. If not checking JNI calls the JNI count is only 1417 // needed for assertion checking. 1418 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1419 1420 __ bind(L_skip_vthread_code); 1421 } 1422 #endif 1423 1424 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1425 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1426 1427 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1428 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1429 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1430 } 1431 1432 static void gen_continuation_enter(MacroAssembler* masm, 1433 const VMRegPair* regs, 1434 int& exception_offset, 1435 OopMapSet* oop_maps, 1436 int& frame_complete, 1437 int& stack_slots, 1438 int& interpreted_entry_offset, 1439 int& compiled_entry_offset) { 1440 1441 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1442 int pos_cont_obj = 0; 1443 int pos_is_cont = 1; 1444 int pos_is_virtual = 2; 1445 1446 // The platform-specific calling convention may present the arguments in various registers. 1447 // To simplify the rest of the code, we expect the arguments to reside at these known 1448 // registers, and we additionally check the placement here in case calling convention ever 1449 // changes. 
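  // (These expectations follow from the "shifted" Java calling convention used
  // by this port -- j_rarg0 == c_rarg1, j_rarg1 == c_rarg2, j_rarg2 == c_rarg3 --
  // and the check_continuation_enter_argument() calls below assert exactly that.)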
1450 Register reg_cont_obj = c_rarg1; 1451 Register reg_is_cont = c_rarg2; 1452 Register reg_is_virtual = c_rarg3; 1453 1454 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1455 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1456 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1457 1458 // Utility methods kill rax, make sure there are no collisions 1459 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1460 1461 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1462 relocInfo::static_call_type); 1463 1464 address start = __ pc(); 1465 1466 Label L_thaw, L_exit; 1467 1468 // i2i entry used at interp_only_mode only 1469 interpreted_entry_offset = __ pc() - start; 1470 { 1471 #ifdef ASSERT 1472 Label is_interp_only; 1473 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1474 __ jcc(Assembler::notEqual, is_interp_only); 1475 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1476 __ bind(is_interp_only); 1477 #endif 1478 1479 __ pop(rax); // return address 1480 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1481 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1482 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1483 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1484 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1485 __ push(rax); // return address 1486 __ push_cont_fastpath(); 1487 1488 __ enter(); 1489 1490 stack_slots = 2; // will be adjusted in setup 1491 OopMap* map = continuation_enter_setup(masm, stack_slots); 1492 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1493 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1494 1495 __ verify_oop(reg_cont_obj); 1496 1497 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1498 1499 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1500 __ testptr(reg_is_cont, reg_is_cont); 1501 __ jcc(Assembler::notZero, L_thaw); 1502 1503 // --- Resolve path 1504 1505 // Make sure the call is patchable 1506 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1507 // Emit stub for static call 1508 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1509 if (stub == nullptr) { 1510 fatal("CodeCache is full at gen_continuation_enter"); 1511 } 1512 __ call(resolve); 1513 oop_maps->add_gc_map(__ pc() - start, map); 1514 __ post_call_nop(); 1515 1516 __ jmp(L_exit); 1517 } 1518 1519 // compiled entry 1520 __ align(CodeEntryAlignment); 1521 compiled_entry_offset = __ pc() - start; 1522 __ enter(); 1523 1524 stack_slots = 2; // will be adjusted in setup 1525 OopMap* map = continuation_enter_setup(masm, stack_slots); 1526 1527 // Frame is now completed as far as size and linkage. 1528 frame_complete = __ pc() - start; 1529 1530 __ verify_oop(reg_cont_obj); 1531 1532 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1533 1534 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1535 __ testptr(reg_is_cont, reg_is_cont); 1536 __ jccb(Assembler::notZero, L_thaw); 1537 1538 // --- call Continuation.enter(Continuation c, boolean isContinue) 1539 1540 // Make sure the call is patchable 1541 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1542 1543 // Emit stub for static call 1544 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1545 if (stub == nullptr) { 1546 fatal("CodeCache is full at gen_continuation_enter"); 1547 } 1548 1549 // The call needs to be resolved. There's a special case for this in 1550 // SharedRuntime::find_callee_info_helper() which calls 1551 // LinkResolver::resolve_continuation_enter() which resolves the call to 1552 // Continuation.enter(Continuation c, boolean isContinue). 1553 __ call(resolve); 1554 1555 oop_maps->add_gc_map(__ pc() - start, map); 1556 __ post_call_nop(); 1557 1558 __ jmpb(L_exit); 1559 1560 // --- Thawing path 1561 1562 __ bind(L_thaw); 1563 1564 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1565 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1566 1567 ContinuationEntry::_return_pc_offset = __ pc() - start; 1568 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1569 __ post_call_nop(); 1570 1571 // --- Normal exit (resolve/thawing) 1572 1573 __ bind(L_exit); 1574 ContinuationEntry::_cleanup_offset = __ pc() - start; 1575 continuation_enter_cleanup(masm); 1576 __ pop(rbp); 1577 __ ret(0); 1578 1579 // --- Exception handling path 1580 1581 exception_offset = __ pc() - start; 1582 1583 continuation_enter_cleanup(masm); 1584 __ pop(rbp); 1585 1586 __ movptr(c_rarg0, r15_thread); 1587 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1588 1589 // rax still holds the original exception oop, save it before the call 1590 __ push(rax); 1591 1592 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1593 __ movptr(rbx, rax); 1594 1595 // Continue at exception handler: 1596 // rax: exception oop 1597 // rbx: exception handler 1598 // rdx: exception pc 1599 __ pop(rax); 1600 __ verify_oop(rax); 1601 __ pop(rdx); 1602 __ jmp(rbx); 1603 } 1604 1605 static void gen_continuation_yield(MacroAssembler* masm, 1606 const VMRegPair* regs, 1607 OopMapSet* oop_maps, 1608 int& frame_complete, 1609 int& stack_slots, 1610 int& compiled_entry_offset) { 1611 enum layout { 1612 rbp_off, 1613 rbpH_off, 1614 return_off, 1615 return_off2, 1616 framesize // inclusive of return address 1617 }; 1618 stack_slots = framesize / VMRegImpl::slots_per_word; 1619 assert(stack_slots == 2, "recheck layout"); 1620 1621 address start = __ pc(); 1622 compiled_entry_offset = __ pc() - start; 1623 __ enter(); 1624 address the_pc = __ pc(); 1625 1626 frame_complete = the_pc - start; 1627 1628 // This nop must be exactly at the PC we push into the frame info. 1629 // We use this nop for fast CodeBlob lookup, associate the OopMap 1630 // with it right away. 
1631 __ post_call_nop(); 1632 OopMap* map = new OopMap(framesize, 1); 1633 oop_maps->add_gc_map(frame_complete, map); 1634 1635 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1636 __ movptr(c_rarg0, r15_thread); 1637 __ movptr(c_rarg1, rsp); 1638 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1639 __ reset_last_Java_frame(true); 1640 1641 Label L_pinned; 1642 1643 __ testptr(rax, rax); 1644 __ jcc(Assembler::notZero, L_pinned); 1645 1646 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1647 continuation_enter_cleanup(masm); 1648 __ pop(rbp); 1649 __ ret(0); 1650 1651 __ bind(L_pinned); 1652 1653 // Pinned, return to caller 1654 1655 // handle pending exception thrown by freeze 1656 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1657 Label ok; 1658 __ jcc(Assembler::equal, ok); 1659 __ leave(); 1660 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1661 __ bind(ok); 1662 1663 __ leave(); 1664 __ ret(0); 1665 } 1666 1667 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1668 ::continuation_enter_cleanup(masm); 1669 } 1670 1671 static void gen_special_dispatch(MacroAssembler* masm, 1672 const methodHandle& method, 1673 const BasicType* sig_bt, 1674 const VMRegPair* regs) { 1675 verify_oop_args(masm, method, sig_bt, regs); 1676 vmIntrinsics::ID iid = method->intrinsic_id(); 1677 1678 // Now write the args into the outgoing interpreter space 1679 bool has_receiver = false; 1680 Register receiver_reg = noreg; 1681 int member_arg_pos = -1; 1682 Register member_reg = noreg; 1683 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1684 if (ref_kind != 0) { 1685 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1686 member_reg = rbx; // known to be free at this point 1687 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1688 } else if (iid == vmIntrinsics::_invokeBasic) { 1689 has_receiver = true; 1690 } else if (iid == vmIntrinsics::_linkToNative) { 1691 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1692 member_reg = rbx; // known to be free at this point 1693 } else { 1694 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1695 } 1696 1697 if (member_reg != noreg) { 1698 // Load the member_arg into register, if necessary. 1699 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1700 VMReg r = regs[member_arg_pos].first(); 1701 if (r->is_stack()) { 1702 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1703 } else { 1704 // no data motion is needed 1705 member_reg = r->as_Register(); 1706 } 1707 } 1708 1709 if (has_receiver) { 1710 // Make sure the receiver is loaded into a register. 1711 assert(method->size_of_parameters() > 0, "oob"); 1712 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1713 VMReg r = regs[0].first(); 1714 assert(r->is_valid(), "bad receiver arg"); 1715 if (r->is_stack()) { 1716 // Porting note: This assumes that compiled calling conventions always 1717 // pass the receiver oop in a register. If this is not true on some 1718 // platform, pick a temp and load the receiver from stack. 
1719 fatal("receiver always in a register"); 1720 receiver_reg = j_rarg0; // known to be free at this point 1721 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1722 } else { 1723 // no data motion is needed 1724 receiver_reg = r->as_Register(); 1725 } 1726 } 1727 1728 // Figure out which address we are really jumping to: 1729 MethodHandles::generate_method_handle_dispatch(masm, iid, 1730 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1731 } 1732 1733 // --------------------------------------------------------------------------- 1734 // Generate a native wrapper for a given method. The method takes arguments 1735 // in the Java compiled code convention, marshals them to the native 1736 // convention (handlizes oops, etc), transitions to native, makes the call, 1737 // returns to java state (possibly blocking), unhandlizes any result and 1738 // returns. 1739 // 1740 // Critical native functions are a shorthand for the use of 1741 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1742 // functions. The wrapper is expected to unpack the arguments before 1743 // passing them to the callee. Critical native functions leave the state _in_Java, 1744 // since they cannot stop for GC. 1745 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1746 // block and the check for pending exceptions it's impossible for them 1747 // to be thrown. 1748 // 1749 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1750 const methodHandle& method, 1751 int compile_id, 1752 BasicType* in_sig_bt, 1753 VMRegPair* in_regs, 1754 BasicType ret_type) { 1755 if (method->is_continuation_native_intrinsic()) { 1756 int exception_offset = -1; 1757 OopMapSet* oop_maps = new OopMapSet(); 1758 int frame_complete = -1; 1759 int stack_slots = -1; 1760 int interpreted_entry_offset = -1; 1761 int vep_offset = -1; 1762 if (method->is_continuation_enter_intrinsic()) { 1763 gen_continuation_enter(masm, 1764 in_regs, 1765 exception_offset, 1766 oop_maps, 1767 frame_complete, 1768 stack_slots, 1769 interpreted_entry_offset, 1770 vep_offset); 1771 } else if (method->is_continuation_yield_intrinsic()) { 1772 gen_continuation_yield(masm, 1773 in_regs, 1774 oop_maps, 1775 frame_complete, 1776 stack_slots, 1777 vep_offset); 1778 } else { 1779 guarantee(false, "Unknown Continuation native intrinsic"); 1780 } 1781 1782 #ifdef ASSERT 1783 if (method->is_continuation_enter_intrinsic()) { 1784 assert(interpreted_entry_offset != -1, "Must be set"); 1785 assert(exception_offset != -1, "Must be set"); 1786 } else { 1787 assert(interpreted_entry_offset == -1, "Must be unset"); 1788 assert(exception_offset == -1, "Must be unset"); 1789 } 1790 assert(frame_complete != -1, "Must be set"); 1791 assert(stack_slots != -1, "Must be set"); 1792 assert(vep_offset != -1, "Must be set"); 1793 #endif 1794 1795 __ flush(); 1796 nmethod* nm = nmethod::new_native_nmethod(method, 1797 compile_id, 1798 masm->code(), 1799 vep_offset, 1800 frame_complete, 1801 stack_slots, 1802 in_ByteSize(-1), 1803 in_ByteSize(-1), 1804 oop_maps, 1805 exception_offset); 1806 if (nm == nullptr) return nm; 1807 if (method->is_continuation_enter_intrinsic()) { 1808 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1809 } else if (method->is_continuation_yield_intrinsic()) { 1810 _cont_doYield_stub = nm; 1811 } 1812 return nm; 1813 } 1814 1815 if (method->is_method_handle_intrinsic()) { 1816 vmIntrinsics::ID iid = method->intrinsic_id(); 1817 intptr_t 
start = (intptr_t)__ pc(); 1818 int vep_offset = ((intptr_t)__ pc()) - start; 1819 gen_special_dispatch(masm, 1820 method, 1821 in_sig_bt, 1822 in_regs); 1823 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1824 __ flush(); 1825 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1826 return nmethod::new_native_nmethod(method, 1827 compile_id, 1828 masm->code(), 1829 vep_offset, 1830 frame_complete, 1831 stack_slots / VMRegImpl::slots_per_word, 1832 in_ByteSize(-1), 1833 in_ByteSize(-1), 1834 nullptr); 1835 } 1836 address native_func = method->native_function(); 1837 assert(native_func != nullptr, "must have function"); 1838 1839 // An OopMap for lock (and class if static) 1840 OopMapSet *oop_maps = new OopMapSet(); 1841 intptr_t start = (intptr_t)__ pc(); 1842 1843 // We have received a description of where all the java arg are located 1844 // on entry to the wrapper. We need to convert these args to where 1845 // the jni function will expect them. To figure out where they go 1846 // we convert the java signature to a C signature by inserting 1847 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1848 1849 const int total_in_args = method->size_of_parameters(); 1850 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1851 1852 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1853 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1854 1855 int argc = 0; 1856 out_sig_bt[argc++] = T_ADDRESS; 1857 if (method->is_static()) { 1858 out_sig_bt[argc++] = T_OBJECT; 1859 } 1860 1861 for (int i = 0; i < total_in_args ; i++ ) { 1862 out_sig_bt[argc++] = in_sig_bt[i]; 1863 } 1864 1865 // Now figure out where the args must be stored and how much stack space 1866 // they require. 1867 int out_arg_slots; 1868 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1869 1870 // Compute framesize for the wrapper. We need to handlize all oops in 1871 // incoming registers 1872 1873 // Calculate the total number of stack slots we will need. 
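  // Illustrative arithmetic only (hypothetical out_arg_slots value): for a non-static,
  // non-synchronized method with out_arg_slots == 8, the computation below gives
  //   stack_slots  = 0 /* out_preserve */ + 8             ->  8   (oop_handle_offset = 8)
  //   stack_slots += 6 * 2 /* oop handle area */          -> 20
  //   stack_slots += 6 /* 2 spill + 4 for ra/rbp */       -> 26
  //   align_up(26, StackAlignmentInSlots /* 4 */)         -> 28, i.e. 112 bytes of frame
  // The klass and lock slots are added on top of this for static/synchronized methods.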
1874 1875 // First count the abi requirement plus all of the outgoing args 1876 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1877 1878 // Now the space for the inbound oop handle area 1879 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1880 1881 int oop_handle_offset = stack_slots; 1882 stack_slots += total_save_slots; 1883 1884 // Now any space we need for handlizing a klass if static method 1885 1886 int klass_slot_offset = 0; 1887 int klass_offset = -1; 1888 int lock_slot_offset = 0; 1889 bool is_static = false; 1890 1891 if (method->is_static()) { 1892 klass_slot_offset = stack_slots; 1893 stack_slots += VMRegImpl::slots_per_word; 1894 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1895 is_static = true; 1896 } 1897 1898 // Plus a lock if needed 1899 1900 if (method->is_synchronized()) { 1901 lock_slot_offset = stack_slots; 1902 stack_slots += VMRegImpl::slots_per_word; 1903 } 1904 1905 // Now a place (+2) to save return values or temp during shuffling 1906 // + 4 for return address (which we own) and saved rbp 1907 stack_slots += 6; 1908 1909 // Ok The space we have allocated will look like: 1910 // 1911 // 1912 // FP-> | | 1913 // |---------------------| 1914 // | 2 slots for moves | 1915 // |---------------------| 1916 // | lock box (if sync) | 1917 // |---------------------| <- lock_slot_offset 1918 // | klass (if static) | 1919 // |---------------------| <- klass_slot_offset 1920 // | oopHandle area | 1921 // |---------------------| <- oop_handle_offset (6 java arg registers) 1922 // | outbound memory | 1923 // | based arguments | 1924 // | | 1925 // |---------------------| 1926 // | | 1927 // SP-> | out_preserved_slots | 1928 // 1929 // 1930 1931 1932 // Now compute actual number of stack words we need rounding to make 1933 // stack properly aligned. 1934 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1935 1936 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1937 1938 // First thing make an ic check to see if we should even be here 1939 1940 // We are free to use all registers as temps without saving them and 1941 // restoring them except rbp. rbp is the only callee save register 1942 // as far as the interpreter and the compiler(s) are concerned. 1943 1944 const Register receiver = j_rarg0; 1945 1946 Label exception_pending; 1947 1948 assert_different_registers(receiver, rscratch1, rscratch2); 1949 __ verify_oop(receiver); 1950 __ ic_check(8 /* end_alignment */); 1951 1952 int vep_offset = ((intptr_t)__ pc()) - start; 1953 1954 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1955 Label L_skip_barrier; 1956 Register klass = r10; 1957 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1958 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 1959 1960 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1961 1962 __ bind(L_skip_barrier); 1963 } 1964 1965 #ifdef COMPILER1 1966 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
1967 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1968 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1969 } 1970 #endif // COMPILER1 1971 1972 // The instruction at the verified entry point must be 5 bytes or longer 1973 // because it can be patched on the fly by make_non_entrant. The stack bang 1974 // instruction fits that requirement. 1975 1976 // Generate stack overflow check 1977 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1978 1979 // Generate a new frame for the wrapper. 1980 __ enter(); 1981 // -2 because return address is already present and so is saved rbp 1982 __ subptr(rsp, stack_size - 2*wordSize); 1983 1984 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1985 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 1986 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 1987 1988 // Frame is now completed as far as size and linkage. 1989 int frame_complete = ((intptr_t)__ pc()) - start; 1990 1991 #ifdef ASSERT 1992 __ check_stack_alignment(rsp, "improperly aligned stack"); 1993 #endif /* ASSERT */ 1994 1995 1996 // We use r14 as the oop handle for the receiver/klass 1997 // It is callee save so it survives the call to native 1998 1999 const Register oop_handle_reg = r14; 2000 2001 // 2002 // We immediately shuffle the arguments so that any vm call we have to 2003 // make from here on out (sync slow path, jvmti, etc.) we will have 2004 // captured the oops from our caller and have a valid oopMap for 2005 // them. 2006 2007 // ----------------- 2008 // The Grand Shuffle 2009 2010 // The Java calling convention is either equal (linux) or denser (win64) than the 2011 // c calling convention. However the because of the jni_env argument the c calling 2012 // convention always has at least one more (and two for static) arguments than Java. 2013 // Therefore if we move the args from java -> c backwards then we will never have 2014 // a register->register conflict and we don't have to build a dependency graph 2015 // and figure out how to break any cycles. 2016 // 2017 2018 // Record esp-based slot for receiver on stack for non-static methods 2019 int receiver_offset = -1; 2020 2021 // This is a trick. We double the stack slots so we can claim 2022 // the oops in the caller's frame. Since we are sure to have 2023 // more args than the caller doubling is enough to make 2024 // sure we can capture all the incoming oop args from the 2025 // caller. 2026 // 2027 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2028 2029 // Mark location of rbp (someday) 2030 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2031 2032 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2033 // All inbound args are referenced based on rbp and all outbound args via rsp. 2034 2035 2036 #ifdef ASSERT 2037 bool reg_destroyed[Register::number_of_registers]; 2038 bool freg_destroyed[XMMRegister::number_of_registers]; 2039 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2040 reg_destroyed[r] = false; 2041 } 2042 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2043 freg_destroyed[f] = false; 2044 } 2045 2046 #endif /* ASSERT */ 2047 2048 // For JNI natives the incoming and outgoing registers are offset upwards. 
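  // Illustrative example of the move order built below: for a non-static method with
  // total_in_args == 3 and total_c_args == 4, arg_order ends up holding the pairs
  //   (2,3), (1,2), (0,1)
  // i.e. Java args are moved into their C positions starting from the last one. Because the
  // C position of arg i is always shifted upwards relative to its Java position, each
  // destination has already been vacated by the time it is written, so no cycle breaking
  // is needed.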
2049 GrowableArray<int> arg_order(2 * total_in_args); 2050 2051 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2052 arg_order.push(i); 2053 arg_order.push(c_arg); 2054 } 2055 2056 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2057 int i = arg_order.at(ai); 2058 int c_arg = arg_order.at(ai + 1); 2059 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2060 #ifdef ASSERT 2061 if (in_regs[i].first()->is_Register()) { 2062 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2063 } else if (in_regs[i].first()->is_XMMRegister()) { 2064 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2065 } 2066 if (out_regs[c_arg].first()->is_Register()) { 2067 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2068 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2069 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2070 } 2071 #endif /* ASSERT */ 2072 switch (in_sig_bt[i]) { 2073 case T_ARRAY: 2074 case T_OBJECT: 2075 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2076 ((i == 0) && (!is_static)), 2077 &receiver_offset); 2078 break; 2079 case T_VOID: 2080 break; 2081 2082 case T_FLOAT: 2083 __ float_move(in_regs[i], out_regs[c_arg]); 2084 break; 2085 2086 case T_DOUBLE: 2087 assert( i + 1 < total_in_args && 2088 in_sig_bt[i + 1] == T_VOID && 2089 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2090 __ double_move(in_regs[i], out_regs[c_arg]); 2091 break; 2092 2093 case T_LONG : 2094 __ long_move(in_regs[i], out_regs[c_arg]); 2095 break; 2096 2097 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2098 2099 default: 2100 __ move32_64(in_regs[i], out_regs[c_arg]); 2101 } 2102 } 2103 2104 int c_arg; 2105 2106 // Pre-load a static method's oop into r14. Used both by locking code and 2107 // the normal JNI call code. 2108 // point c_arg at the first arg that is already loaded in case we 2109 // need to spill before we call out 2110 c_arg = total_c_args - total_in_args; 2111 2112 if (method->is_static()) { 2113 2114 // load oop into a register 2115 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2116 2117 // Now handlize the static class mirror it's known not-null. 2118 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2119 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2120 2121 // Now get the handle 2122 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2123 // store the klass handle as second argument 2124 __ movptr(c_rarg1, oop_handle_reg); 2125 // and protect the arg if we must spill 2126 c_arg--; 2127 } 2128 2129 // Change state to native (we save the return address in the thread, since it might not 2130 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2131 // points into the right code segment. It does not have to be the correct return pc. 2132 // We use the same pc/oopMap repeatedly when we call out 2133 2134 Label native_return; 2135 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2136 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2137 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2138 } else { 2139 intptr_t the_pc = (intptr_t) __ pc(); 2140 oop_maps->add_gc_map(the_pc - start, map); 2141 2142 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2143 } 2144 2145 // We have all of the arguments setup at this point. 
We must not touch any register 2146 // argument registers at this point (what if we save/restore them there are no oop? 2147 2148 if (DTraceMethodProbes) { 2149 // protect the args we've loaded 2150 save_args(masm, total_c_args, c_arg, out_regs); 2151 __ mov_metadata(c_rarg1, method()); 2152 __ call_VM_leaf( 2153 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2154 r15_thread, c_rarg1); 2155 restore_args(masm, total_c_args, c_arg, out_regs); 2156 } 2157 2158 // RedefineClasses() tracing support for obsolete method entry 2159 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2160 // protect the args we've loaded 2161 save_args(masm, total_c_args, c_arg, out_regs); 2162 __ mov_metadata(c_rarg1, method()); 2163 __ call_VM_leaf( 2164 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2165 r15_thread, c_rarg1); 2166 restore_args(masm, total_c_args, c_arg, out_regs); 2167 } 2168 2169 // Lock a synchronized method 2170 2171 // Register definitions used by locking and unlocking 2172 2173 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2174 const Register obj_reg = rbx; // Will contain the oop 2175 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2176 const Register old_hdr = r13; // value of old header at unlock time 2177 2178 Label slow_path_lock; 2179 Label lock_done; 2180 2181 if (method->is_synchronized()) { 2182 Label count_mon; 2183 2184 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2185 2186 // Get the handle (the 2nd argument) 2187 __ mov(oop_handle_reg, c_rarg1); 2188 2189 // Get address of the box 2190 2191 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2192 2193 // Load the oop from the handle 2194 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2195 2196 if (LockingMode == LM_MONITOR) { 2197 __ jmp(slow_path_lock); 2198 } else if (LockingMode == LM_LEGACY) { 2199 // Load immediate 1 into swap_reg %rax 2200 __ movl(swap_reg, 1); 2201 2202 // Load (object->mark() | 1) into swap_reg %rax 2203 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2204 2205 // Save (object->mark() | 1) into BasicLock's displaced header 2206 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2207 2208 // src -> dest iff dest == rax else rax <- dest 2209 __ lock(); 2210 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2211 __ jcc(Assembler::equal, count_mon); 2212 2213 // Hmm should this move to the slow path code area??? 2214 2215 // Test if the oopMark is an obvious stack pointer, i.e., 2216 // 1) (mark & 3) == 0, and 2217 // 2) rsp <= mark < mark + os::pagesize() 2218 // These 3 tests can be done by evaluating the following 2219 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2220 // assuming both stack pointer and pagesize have their 2221 // least significant 2 bits clear. 
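  // Worked example, assuming a 4096-byte page: 3 - 4096 == -4093, whose bit pattern keeps
  // only bits 0-1 and bits 12 and up. The and-result below is therefore zero exactly when
  //   (mark & 3) == 0            (low bits clear: the mark looks like a stack pointer), and
  //   0 <= mark - rsp < 4096     (high bits clear: mark lies within one page above rsp),
  // i.e. rsp <= mark < rsp + os::vm_page_size(), which is the stack-pointer test described
  // above.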
2222 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2223 2224 __ subptr(swap_reg, rsp); 2225 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2226 2227 // Save the test result, for recursive case, the result is zero 2228 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2229 __ jcc(Assembler::notEqual, slow_path_lock); 2230 2231 __ bind(count_mon); 2232 __ inc_held_monitor_count(); 2233 } else { 2234 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2235 __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock); 2236 } 2237 2238 // Slow path will re-enter here 2239 __ bind(lock_done); 2240 } 2241 2242 // Finally just about ready to make the JNI call 2243 2244 // get JNIEnv* which is first argument to native 2245 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2246 2247 // Now set thread in native 2248 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2249 2250 __ call(RuntimeAddress(native_func)); 2251 2252 // Verify or restore cpu control state after JNI call 2253 __ restore_cpu_control_state_after_jni(rscratch1); 2254 2255 // Unpack native results. 2256 switch (ret_type) { 2257 case T_BOOLEAN: __ c2bool(rax); break; 2258 case T_CHAR : __ movzwl(rax, rax); break; 2259 case T_BYTE : __ sign_extend_byte (rax); break; 2260 case T_SHORT : __ sign_extend_short(rax); break; 2261 case T_INT : /* nothing to do */ break; 2262 case T_DOUBLE : 2263 case T_FLOAT : 2264 // Result is in xmm0 we'll save as needed 2265 break; 2266 case T_ARRAY: // Really a handle 2267 case T_OBJECT: // Really a handle 2268 break; // can't de-handlize until after safepoint check 2269 case T_VOID: break; 2270 case T_LONG: break; 2271 default : ShouldNotReachHere(); 2272 } 2273 2274 // Switch thread to "native transition" state before reading the synchronization state. 2275 // This additional state is necessary because reading and testing the synchronization 2276 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2277 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2278 // VM thread changes sync state to synchronizing and suspends threads for GC. 2279 // Thread A is resumed to finish this native method, but doesn't block here since it 2280 // didn't see any synchronization is progress, and escapes. 2281 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2282 2283 // Force this write out before the read below 2284 if (!UseSystemMemoryBarrier) { 2285 __ membar(Assembler::Membar_mask_bits( 2286 Assembler::LoadLoad | Assembler::LoadStore | 2287 Assembler::StoreLoad | Assembler::StoreStore)); 2288 } 2289 2290 // check for safepoint operation in progress and/or pending suspend requests 2291 { 2292 Label Continue; 2293 Label slow_path; 2294 2295 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */); 2296 2297 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2298 __ jcc(Assembler::equal, Continue); 2299 __ bind(slow_path); 2300 2301 // Don't use call_VM as it will see a possible pending exception and forward it 2302 // and never return here preventing us from clearing _last_native_pc down below. 2303 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2304 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2305 // by hand. 
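  // Descriptive summary of the hand-made leaf call below (the code itself is authoritative):
  // save any live native result, remember rsp in r12, reserve frame::arg_reg_save_area_bytes
  // of outgoing space (the Win64 register "home" area; 0 on System V targets), align rsp to
  // 16 bytes as the native ABI requires, make the call, then restore rsp and the result.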
2306 // 2307 __ vzeroupper(); 2308 save_native_result(masm, ret_type, stack_slots); 2309 __ mov(c_rarg0, r15_thread); 2310 __ mov(r12, rsp); // remember sp 2311 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2312 __ andptr(rsp, -16); // align stack as required by ABI 2313 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2314 __ mov(rsp, r12); // restore sp 2315 __ reinit_heapbase(); 2316 // Restore any method result value 2317 restore_native_result(masm, ret_type, stack_slots); 2318 __ bind(Continue); 2319 } 2320 2321 // change thread state 2322 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2323 2324 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2325 // Check preemption for Object.wait() 2326 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2327 __ cmpptr(rscratch1, NULL_WORD); 2328 __ jccb(Assembler::equal, native_return); 2329 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2330 __ jmp(rscratch1); 2331 __ bind(native_return); 2332 2333 intptr_t the_pc = (intptr_t) __ pc(); 2334 oop_maps->add_gc_map(the_pc - start, map); 2335 } 2336 2337 2338 Label reguard; 2339 Label reguard_done; 2340 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2341 __ jcc(Assembler::equal, reguard); 2342 __ bind(reguard_done); 2343 2344 // native result if any is live 2345 2346 // Unlock 2347 Label slow_path_unlock; 2348 Label unlock_done; 2349 if (method->is_synchronized()) { 2350 2351 Label fast_done; 2352 2353 // Get locked oop from the handle we passed to jni 2354 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2355 2356 if (LockingMode == LM_LEGACY) { 2357 Label not_recur; 2358 // Simple recursive lock? 
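      // Descriptive note: a null displaced header in the on-stack BasicLock marks a
      // recursive stack lock (that is the zero test result saved into the BasicLock by the
      // legacy locking code earlier in this wrapper), so in that case only the held-monitor
      // count needs to be decremented and the real unlock can be skipped.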
2359 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2360 __ jcc(Assembler::notEqual, not_recur); 2361 __ dec_held_monitor_count(); 2362 __ jmpb(fast_done); 2363 __ bind(not_recur); 2364 } 2365 2366 // Must save rax if it is live now because cmpxchg must use it 2367 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2368 save_native_result(masm, ret_type, stack_slots); 2369 } 2370 2371 if (LockingMode == LM_MONITOR) { 2372 __ jmp(slow_path_unlock); 2373 } else if (LockingMode == LM_LEGACY) { 2374 // get address of the stack lock 2375 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2376 // get old displaced header 2377 __ movptr(old_hdr, Address(rax, 0)); 2378 2379 // Atomic swap old header if oop still contains the stack lock 2380 __ lock(); 2381 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2382 __ jcc(Assembler::notEqual, slow_path_unlock); 2383 __ dec_held_monitor_count(); 2384 } else { 2385 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2386 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2387 } 2388 2389 // slow path re-enters here 2390 __ bind(unlock_done); 2391 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2392 restore_native_result(masm, ret_type, stack_slots); 2393 } 2394 2395 __ bind(fast_done); 2396 } 2397 if (DTraceMethodProbes) { 2398 save_native_result(masm, ret_type, stack_slots); 2399 __ mov_metadata(c_rarg1, method()); 2400 __ call_VM_leaf( 2401 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2402 r15_thread, c_rarg1); 2403 restore_native_result(masm, ret_type, stack_slots); 2404 } 2405 2406 __ reset_last_Java_frame(false); 2407 2408 // Unbox oop result, e.g. JNIHandles::resolve value. 2409 if (is_reference_type(ret_type)) { 2410 __ resolve_jobject(rax /* value */, 2411 rcx /* tmp */); 2412 } 2413 2414 if (CheckJNICalls) { 2415 // clear_pending_jni_exception_check 2416 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2417 } 2418 2419 // reset handle block 2420 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2421 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2422 2423 // pop our frame 2424 2425 __ leave(); 2426 2427 // Any exception pending? 2428 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2429 __ jcc(Assembler::notEqual, exception_pending); 2430 2431 // Return 2432 2433 __ ret(0); 2434 2435 // Unexpected paths are out of line and go here 2436 2437 // forward the exception 2438 __ bind(exception_pending); 2439 2440 // and forward the exception 2441 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2442 2443 // Slow path locking & unlocking 2444 if (method->is_synchronized()) { 2445 2446 // BEGIN Slow path lock 2447 __ bind(slow_path_lock); 2448 2449 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2450 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2451 2452 // protect the args we've loaded 2453 save_args(masm, total_c_args, c_arg, out_regs); 2454 2455 __ mov(c_rarg0, obj_reg); 2456 __ mov(c_rarg1, lock_reg); 2457 __ mov(c_rarg2, r15_thread); 2458 2459 // Not a leaf but we have last_Java_frame setup as we want. 2460 // We don't want to unmount in case of contention since that would complicate preserving 2461 // the arguments that had already been marshalled into the native convention. 
So we force 2462 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2463 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 2464 __ push_cont_fastpath(); 2465 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2466 __ pop_cont_fastpath(); 2467 restore_args(masm, total_c_args, c_arg, out_regs); 2468 2469 #ifdef ASSERT 2470 { Label L; 2471 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2472 __ jcc(Assembler::equal, L); 2473 __ stop("no pending exception allowed on exit from monitorenter"); 2474 __ bind(L); 2475 } 2476 #endif 2477 __ jmp(lock_done); 2478 2479 // END Slow path lock 2480 2481 // BEGIN Slow path unlock 2482 __ bind(slow_path_unlock); 2483 2484 // If we haven't already saved the native result we must save it now as xmm registers 2485 // are still exposed. 2486 __ vzeroupper(); 2487 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2488 save_native_result(masm, ret_type, stack_slots); 2489 } 2490 2491 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2492 2493 __ mov(c_rarg0, obj_reg); 2494 __ mov(c_rarg2, r15_thread); 2495 __ mov(r12, rsp); // remember sp 2496 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2497 __ andptr(rsp, -16); // align stack as required by ABI 2498 2499 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2500 // NOTE that obj_reg == rbx currently 2501 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2502 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2503 2504 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2505 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2506 __ mov(rsp, r12); // restore sp 2507 __ reinit_heapbase(); 2508 #ifdef ASSERT 2509 { 2510 Label L; 2511 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2512 __ jcc(Assembler::equal, L); 2513 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2514 __ bind(L); 2515 } 2516 #endif /* ASSERT */ 2517 2518 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2519 2520 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2521 restore_native_result(masm, ret_type, stack_slots); 2522 } 2523 __ jmp(unlock_done); 2524 2525 // END Slow path unlock 2526 2527 } // synchronized 2528 2529 // SLOW PATH Reguard the stack if needed 2530 2531 __ bind(reguard); 2532 __ vzeroupper(); 2533 save_native_result(masm, ret_type, stack_slots); 2534 __ mov(r12, rsp); // remember sp 2535 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2536 __ andptr(rsp, -16); // align stack as required by ABI 2537 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2538 __ mov(rsp, r12); // restore sp 2539 __ reinit_heapbase(); 2540 restore_native_result(masm, ret_type, stack_slots); 2541 // and continue 2542 __ jmp(reguard_done); 2543 2544 2545 2546 __ flush(); 2547 2548 nmethod *nm = nmethod::new_native_nmethod(method, 2549 compile_id, 2550 masm->code(), 2551 vep_offset, 2552 frame_complete, 2553 stack_slots / VMRegImpl::slots_per_word, 2554 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2555 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2556 oop_maps); 2557 2558 return nm; 2559 } 2560 2561 // this function returns the adjust size (in number of words) to a c2i adapter 2562 // activation for use during deoptimization 2563 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2564 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2565 } 2566 2567 2568 uint SharedRuntime::out_preserve_stack_slots() { 2569 return 0; 2570 } 2571 2572 2573 // Number of stack slots between incoming argument block and the start of 2574 // a new frame. The PROLOG must add this many slots to the stack. The 2575 // EPILOG must remove this many slots. amd64 needs two slots for 2576 // return address. 2577 uint SharedRuntime::in_preserve_stack_slots() { 2578 return 4 + 2 * VerifyStackAtCalls; 2579 } 2580 2581 VMReg SharedRuntime::thread_register() { 2582 return r15_thread->as_VMReg(); 2583 } 2584 2585 //------------------------------generate_deopt_blob---------------------------- 2586 void SharedRuntime::generate_deopt_blob() { 2587 // Allocate space for the code 2588 ResourceMark rm; 2589 // Setup code generation tools 2590 int pad = 0; 2591 if (UseAVX > 2) { 2592 pad += 1024; 2593 } 2594 if (UseAPX) { 2595 pad += 1024; 2596 } 2597 #if INCLUDE_JVMCI 2598 if (EnableJVMCI) { 2599 pad += 512; // Increase the buffer size when compiling for JVMCI 2600 } 2601 #endif 2602 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id); 2603 CodeBuffer buffer(name, 2560+pad, 1024); 2604 MacroAssembler* masm = new MacroAssembler(&buffer); 2605 int frame_size_in_words; 2606 OopMap* map = nullptr; 2607 OopMapSet *oop_maps = new OopMapSet(); 2608 2609 // ------------- 2610 // This code enters when returning to a de-optimized nmethod. A return 2611 // address has been pushed on the stack, and return values are in 2612 // registers. 2613 // If we are doing a normal deopt then we were called from the patched 2614 // nmethod from the point we returned to the nmethod. So the return 2615 // address on the stack is wrong by NativeCall::instruction_size 2616 // We will adjust the value so it looks like we have the original return 2617 // address on the stack (like when we eagerly deoptimized). 2618 // In the case of an exception pending when deoptimizing, we enter 2619 // with a return address on the stack that points after the call we patched 2620 // into the exception handler. We have the following register state from, 2621 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2622 // rax: exception oop 2623 // rbx: exception handler 2624 // rdx: throwing pc 2625 // So in this case we simply jam rdx into the useless return address and 2626 // the stack looks just like we want. 2627 // 2628 // At this point we need to de-opt. We save the argument return 2629 // registers. We call the first C routine, fetch_unroll_info(). This 2630 // routine captures the return values and returns a structure which 2631 // describes the current frame size and the sizes of all replacement frames. 2632 // The current frame is compiled code and may contain many inlined 2633 // functions, each with their own JVM state. We pop the current frame, then 2634 // push all the new frames. Then we call the C routine unpack_frames() to 2635 // populate these frames. Finally unpack_frames() returns us the new target 2636 // address. 
Notice that callee-save registers are BLOWN here; they have 2637 // already been captured in the vframeArray at the time the return PC was 2638 // patched. 2639 address start = __ pc(); 2640 Label cont; 2641 2642 // Prolog for non exception case! 2643 2644 // Save everything in sight. 2645 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2646 2647 // Normal deoptimization. Save exec mode for unpack_frames. 2648 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2649 __ jmp(cont); 2650 2651 int reexecute_offset = __ pc() - start; 2652 #if INCLUDE_JVMCI && !defined(COMPILER1) 2653 if (UseJVMCICompiler) { 2654 // JVMCI does not use this kind of deoptimization 2655 __ should_not_reach_here(); 2656 } 2657 #endif 2658 2659 // Reexecute case 2660 // return address is the pc describes what bci to do re-execute at 2661 2662 // No need to update map as each call to save_live_registers will produce identical oopmap 2663 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2664 2665 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2666 __ jmp(cont); 2667 2668 #if INCLUDE_JVMCI 2669 Label after_fetch_unroll_info_call; 2670 int implicit_exception_uncommon_trap_offset = 0; 2671 int uncommon_trap_offset = 0; 2672 2673 if (EnableJVMCI) { 2674 implicit_exception_uncommon_trap_offset = __ pc() - start; 2675 2676 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2677 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2678 2679 uncommon_trap_offset = __ pc() - start; 2680 2681 // Save everything in sight. 2682 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2683 // fetch_unroll_info needs to call last_java_frame() 2684 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2685 2686 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2687 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2688 2689 __ movl(r14, Deoptimization::Unpack_reexecute); 2690 __ mov(c_rarg0, r15_thread); 2691 __ movl(c_rarg2, r14); // exec mode 2692 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2693 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2694 2695 __ reset_last_Java_frame(false); 2696 2697 __ jmp(after_fetch_unroll_info_call); 2698 } // EnableJVMCI 2699 #endif // INCLUDE_JVMCI 2700 2701 int exception_offset = __ pc() - start; 2702 2703 // Prolog for exception case 2704 2705 // all registers are dead at this entry point, except for rax, and 2706 // rdx which contain the exception oop and exception pc 2707 // respectively. Set them in TLS and fall thru to the 2708 // unpack_with_exception_in_tls entry point. 2709 2710 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2711 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2712 2713 int exception_in_tls_offset = __ pc() - start; 2714 2715 // new implementation because exception oop is now passed in JavaThread 2716 2717 // Prolog for exception case 2718 // All registers must be preserved because they might be used by LinearScan 2719 // Exceptiop oop and throwing PC are passed in JavaThread 2720 // tos: stack at point of call to method that threw the exception (i.e. 
only 2721 // args are on the stack, no return address) 2722 2723 // make room on stack for the return address 2724 // It will be patched later with the throwing pc. The correct value is not 2725 // available now because loading it from memory would destroy registers. 2726 __ push(0); 2727 2728 // Save everything in sight. 2729 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2730 2731 // Now it is safe to overwrite any register 2732 2733 // Deopt during an exception. Save exec mode for unpack_frames. 2734 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2735 2736 // load throwing pc from JavaThread and patch it as the return address 2737 // of the current frame. Then clear the field in JavaThread 2738 2739 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2740 __ movptr(Address(rbp, wordSize), rdx); 2741 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2742 2743 #ifdef ASSERT 2744 // verify that there is really an exception oop in JavaThread 2745 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2746 __ verify_oop(rax); 2747 2748 // verify that there is no pending exception 2749 Label no_pending_exception; 2750 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2751 __ testptr(rax, rax); 2752 __ jcc(Assembler::zero, no_pending_exception); 2753 __ stop("must not have pending exception here"); 2754 __ bind(no_pending_exception); 2755 #endif 2756 2757 __ bind(cont); 2758 2759 // Call C code. Need thread and this frame, but NOT official VM entry 2760 // crud. We cannot block on this call, no GC can happen. 2761 // 2762 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2763 2764 // fetch_unroll_info needs to call last_java_frame(). 2765 2766 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2767 #ifdef ASSERT 2768 { Label L; 2769 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2770 __ jcc(Assembler::equal, L); 2771 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2772 __ bind(L); 2773 } 2774 #endif // ASSERT 2775 __ mov(c_rarg0, r15_thread); 2776 __ movl(c_rarg1, r14); // exec_mode 2777 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2778 2779 // Need to have an oopmap that tells fetch_unroll_info where to 2780 // find any register it might need. 2781 oop_maps->add_gc_map(__ pc() - start, map); 2782 2783 __ reset_last_Java_frame(false); 2784 2785 #if INCLUDE_JVMCI 2786 if (EnableJVMCI) { 2787 __ bind(after_fetch_unroll_info_call); 2788 } 2789 #endif 2790 2791 // Load UnrollBlock* into rdi 2792 __ mov(rdi, rax); 2793 2794 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2795 Label noException; 2796 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2797 __ jcc(Assembler::notEqual, noException); 2798 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2799 // QQQ this is useless it was null above 2800 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2801 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2802 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2803 2804 __ verify_oop(rax); 2805 2806 // Overwrite the result registers with the exception results. 
2807 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2808 // I think this is useless
2809 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2810
2811 __ bind(noException);
2812
2813 // Only register save data is on the stack.
2814 // Now restore the result registers. Everything else is either dead
2815 // or captured in the vframeArray.
2816 RegisterSaver::restore_result_registers(masm);
2817
2818 // All of the register save area has been popped off the stack. Only the
2819 // return address remains.
2820
2821 // Pop all the frames we must move/replace.
2822 //
2823 // Frame picture (youngest to oldest)
2824 // 1: self-frame (no frame link)
2825 // 2: deopting frame (no frame link)
2826 // 3: caller of deopting frame (could be compiled/interpreted).
2827 //
2828 // Note: by leaving the return address of self-frame on the stack
2829 // and using the size of frame 2 to adjust the stack
2830 // when we are done, the return to frame 3 will still be on the stack.
2831
2832 // Pop deoptimized frame
2833 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2834 __ addptr(rsp, rcx);
2835
2836 // rsp should be pointing at the return address to the caller (3)
2837
2838 // Pick up the initial fp we should save
2839 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2840 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2841
2842 #ifdef ASSERT
2843 // Compilers generate code that bangs the stack by as much as the
2844 // interpreter would need. So this stack banging should never
2845 // trigger a fault. Verify that it does not on non-product builds.
2846 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2847 __ bang_stack_size(rbx, rcx);
2848 #endif
2849
2850 // Load address of array of frame pcs into rcx
2851 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2852
2853 // Trash the old pc
2854 __ addptr(rsp, wordSize);
2855
2856 // Load address of array of frame sizes into rsi
2857 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2858
2859 // Load counter into rdx
2860 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2861
2862 // Now adjust the caller's stack to make up for the extra locals
2863 // but record the original sp so that we can save it in the skeletal interpreter
2864 // frame and the stack walking of interpreter_sender will get the unextended sp
2865 // value and not the "real" sp value.
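  // Roughly, the loop below does the following for each skeletal interpreter frame
  // (illustrative pseudo-code only):
  //
  //   for (k = 0; k < number_of_frames; k++) {
  //     push(frame_pcs[k]);                        // return address for this frame
  //     push(rbp); rbp = rsp;                      // enter()
  //     rsp -= frame_sizes[k] - 2*wordSize;        // body of the frame
  //     interpreter_frame_last_sp   = nullptr;     // fixed later by layout_activation_impl
  //     interpreter_frame_sender_sp = sender_sp;   // keep the stack walkable
  //     sender_sp = rsp;
  //   }
  //   push(frame_pcs[number_of_frames]);           // final return address
  //
  // unpack_frames() then fills in the real interpreter state for each of these frames.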
2866 2867 const Register sender_sp = r8; 2868 2869 __ mov(sender_sp, rsp); 2870 __ movl(rbx, Address(rdi, 2871 Deoptimization::UnrollBlock:: 2872 caller_adjustment_offset())); 2873 __ subptr(rsp, rbx); 2874 2875 // Push interpreter frames in a loop 2876 Label loop; 2877 __ bind(loop); 2878 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2879 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2880 __ pushptr(Address(rcx, 0)); // Save return address 2881 __ enter(); // Save old & set new ebp 2882 __ subptr(rsp, rbx); // Prolog 2883 // This value is corrected by layout_activation_impl 2884 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2885 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2886 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2887 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2888 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2889 __ decrementl(rdx); // Decrement counter 2890 __ jcc(Assembler::notZero, loop); 2891 __ pushptr(Address(rcx, 0)); // Save final return address 2892 2893 // Re-push self-frame 2894 __ enter(); // Save old & set new ebp 2895 2896 // Allocate a full sized register save area. 2897 // Return address and rbp are in place, so we allocate two less words. 2898 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2899 2900 // Restore frame locals after moving the frame 2901 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2902 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2903 2904 // Call C code. Need thread but NOT official VM entry 2905 // crud. We cannot block on this call, no GC can happen. Call should 2906 // restore return values to their stack-slots with the new SP. 2907 // 2908 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2909 2910 // Use rbp because the frames look interpreted now 2911 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2912 // Don't need the precise return PC here, just precise enough to point into this code blob. 2913 address the_pc = __ pc(); 2914 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2915 2916 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2917 __ mov(c_rarg0, r15_thread); 2918 __ movl(c_rarg1, r14); // second arg: exec_mode 2919 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2920 // Revert SP alignment after call since we're going to do some SP relative addressing below 2921 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2922 2923 // Set an oopmap for the call site 2924 // Use the same PC we used for the last java frame 2925 oop_maps->add_gc_map(the_pc - start, 2926 new OopMap( frame_size_in_words, 0 )); 2927 2928 // Clear fp AND pc 2929 __ reset_last_Java_frame(true); 2930 2931 // Collect return values 2932 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2933 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2934 // I think this is useless (throwing pc?) 2935 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2936 2937 // Pop self-frame. 
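  // leave() (mov rsp, rbp; pop rbp) drops the self-frame that was re-pushed above, so the
  // ret(0) below resumes at the return address pushed just before that re-push, i.e. the
  // "Jump to interpreter" noted below rather than a return into this blob.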
2938 __ leave(); // Epilog 2939 2940 // Jump to interpreter 2941 __ ret(0); 2942 2943 // Make sure all code is generated 2944 masm->flush(); 2945 2946 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2947 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2948 #if INCLUDE_JVMCI 2949 if (EnableJVMCI) { 2950 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2951 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2952 } 2953 #endif 2954 } 2955 2956 //------------------------------generate_handler_blob------ 2957 // 2958 // Generate a special Compile2Runtime blob that saves all registers, 2959 // and setup oopmap. 2960 // 2961 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) { 2962 assert(StubRoutines::forward_exception_entry() != nullptr, 2963 "must be generated before"); 2964 assert(is_polling_page_id(id), "expected a polling page stub id"); 2965 2966 ResourceMark rm; 2967 OopMapSet *oop_maps = new OopMapSet(); 2968 OopMap* map; 2969 2970 // Allocate space for the code. Setup code generation tools. 2971 const char* name = SharedRuntime::stub_name(id); 2972 CodeBuffer buffer(name, 2548, 1024); 2973 MacroAssembler* masm = new MacroAssembler(&buffer); 2974 2975 address start = __ pc(); 2976 address call_pc = nullptr; 2977 int frame_size_in_words; 2978 bool cause_return = (id == SharedStubId::polling_page_return_handler_id); 2979 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id); 2980 2981 // Make room for return address (or push it again) 2982 if (!cause_return) { 2983 __ push(rbx); 2984 } 2985 2986 // Save registers, fpu state, and flags 2987 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 2988 2989 // The following is basically a call_VM. However, we need the precise 2990 // address of the call in order to generate an oopmap. Hence, we do all the 2991 // work ourselves. 2992 2993 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 2994 2995 // The return address must always be correct so that frame constructor never 2996 // sees an invalid pc. 2997 2998 if (!cause_return) { 2999 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3000 // Additionally, rbx is a callee saved register and we can look at it later to determine 3001 // if someone changed the return address for us! 3002 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3003 __ movptr(Address(rbp, wordSize), rbx); 3004 } 3005 3006 // Do the call 3007 __ mov(c_rarg0, r15_thread); 3008 __ call(RuntimeAddress(call_ptr)); 3009 3010 // Set an oopmap for the call site. This oopmap will map all 3011 // oop-registers and debug-info registers as callee-saved. This 3012 // will allow deoptimization at this safepoint to find all possible 3013 // debug-info recordings, as well as let GC find all oops. 
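  // Note: the offset recorded for this oopmap is __ pc() - start, i.e. the instruction
  // right after the call. That is the return address a stack walker finds in the frame,
  // so the map is keyed on the pc that identifies this call site.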
3014 3015 oop_maps->add_gc_map( __ pc() - start, map); 3016 3017 Label noException; 3018 3019 __ reset_last_Java_frame(false); 3020 3021 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3022 __ jcc(Assembler::equal, noException); 3023 3024 // Exception pending 3025 3026 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3027 3028 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3029 3030 // No exception case 3031 __ bind(noException); 3032 3033 Label no_adjust; 3034 #ifdef ASSERT 3035 Label bail; 3036 #endif 3037 if (!cause_return) { 3038 Label no_prefix, not_special, check_rex_prefix; 3039 3040 // If our stashed return pc was modified by the runtime we avoid touching it 3041 __ cmpptr(rbx, Address(rbp, wordSize)); 3042 __ jcc(Assembler::notEqual, no_adjust); 3043 3044 // Skip over the poll instruction. 3045 // See NativeInstruction::is_safepoint_poll() 3046 // Possible encodings: 3047 // 85 00 test %eax,(%rax) 3048 // 85 01 test %eax,(%rcx) 3049 // 85 02 test %eax,(%rdx) 3050 // 85 03 test %eax,(%rbx) 3051 // 85 06 test %eax,(%rsi) 3052 // 85 07 test %eax,(%rdi) 3053 // 3054 // 41 85 00 test %eax,(%r8) 3055 // 41 85 01 test %eax,(%r9) 3056 // 41 85 02 test %eax,(%r10) 3057 // 41 85 03 test %eax,(%r11) 3058 // 41 85 06 test %eax,(%r14) 3059 // 41 85 07 test %eax,(%r15) 3060 // 3061 // 85 04 24 test %eax,(%rsp) 3062 // 41 85 04 24 test %eax,(%r12) 3063 // 85 45 00 test %eax,0x0(%rbp) 3064 // 41 85 45 00 test %eax,0x0(%r13) 3065 // 3066 // Notes: 3067 // Format of legacy MAP0 test instruction:- 3068 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32] 3069 // o For safepoint polling instruction "test %eax,(%rax)", encoding of first register 3070 // operand and base register of memory operand is b/w [0-8), hence we do not require 3071 // additional REX prefix where REX.B bit stores MSB bit of register encoding, which 3072 // is why two bytes encoding is sufficient here. 3073 // o For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE 3074 // register of memory operand is 1000, thus we need additional REX prefix in this case, 3075 // there by adding additional byte to instruction encoding. 3076 // o In case BASE register is one of the 32 extended GPR registers available only on targets 3077 // supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold 3078 // most significant two bits of 5 bit register encoding. 3079 3080 if (VM_Version::supports_apx_f()) { 3081 __ cmpb(Address(rbx, 0), Assembler::REX2); 3082 __ jccb(Assembler::notEqual, check_rex_prefix); 3083 __ addptr(rbx, 2); 3084 __ bind(check_rex_prefix); 3085 } 3086 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3087 __ jccb(Assembler::notEqual, no_prefix); 3088 __ addptr(rbx, 1); 3089 __ bind(no_prefix); 3090 #ifdef ASSERT 3091 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3092 #endif 3093 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3094 // r12/rsp 0x04 3095 // r13/rbp 0x05 3096 __ movzbq(rcx, Address(rbx, 1)); 3097 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3098 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3099 __ cmpptr(rcx, 1); 3100 __ jccb(Assembler::above, not_special); 3101 __ addptr(rbx, 1); 3102 __ bind(not_special); 3103 #ifdef ASSERT 3104 // Verify the correct encoding of the poll we're about to skip. 

//
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the vm to find out the proper destination
// of a java call. All the argument registers are live at this point
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
//
RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
  assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
  assert(is_resolve_id(id), "expected a resolve stub id");

  // allocate space for the code
  ResourceMark rm;

  const char* name = SharedRuntime::stub_name(id);
  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));

  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_2(rbx);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob (the frame size is in words)
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}

// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs. If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size = 64;

  ResourceMark rm;
  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently from the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack

  // Call runtime
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  return stub;
}

//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// Subtract 0:b from carry:a. Return carry.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
} while(0)

#else //_WINDOWS

static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)

// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
} while(0)

#endif //_WINDOWS

// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}
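
// Illustrative, compiled-out sketch (not used by the VM): what MACC and MACC2
// compute, written with the GCC/Clang unsigned __int128 type instead of inline
// assembly or compiler intrinsics. (T2:T1:T0) acts as a 192-bit accumulator to
// which A*B (or 2*A*B) is added; the function names are made up for this sketch.
#if 0
static void macc_sketch(julong A, julong B,
                        julong &T0, julong &T1, julong &T2) {
  unsigned __int128 prod = (unsigned __int128)A * B;
  julong lo = (julong)prod, hi = (julong)(prod >> 64);
  unsigned __int128 sum = (unsigned __int128)T0 + lo;
  T0 = (julong)sum;
  sum = (unsigned __int128)T1 + hi + (julong)(sum >> 64);
  T1 = (julong)sum;
  T2 += (julong)(sum >> 64);              // propagate the final carry
}

static void macc2_sketch(julong A, julong B,
                         julong &T0, julong &T1, julong &T2) {
  macc_sketch(A, B, T0, T1, T2);          // add A*B twice
  macc_sketch(A, B, T0, T1, T2);
}
#endif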

// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  while (t0)
    t0 = sub(m, n, t0, len);
}

// Swap words in a longword.
static julong swap(julong x) {
  return (x << 32) | (x >> 32);
}

// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(julong *s, julong *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    *d = swap(*s);
    s++;
  }
}

// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64

void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  reverse_words(m, (julong *)m_ints, longwords);
}
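
// Illustrative, compiled-out sketch (not used by the VM): the 'inv' argument
// passed to these entry points must satisfy inv * n[0] == -1 (mod 2^64), which
// is what the asserts in montgomery_multiply() and montgomery_square() check.
// One way to derive such a value from an odd least-significant word n0 is
// Hensel lifting (a Newton iteration mod 2^k); the function name is made up.
#if 0
static julong montgomery_inverse_sketch(julong n0) {
  assert((n0 & 1) != 0, "modulus must be odd");
  julong x = n0;                // n0 is its own inverse mod 2^3 when n0 is odd
  for (int i = 0; i < 5; i++) { // 3 -> 6 -> 12 -> 24 -> 48 -> 96 valid bits
    x *= 2 - n0 * x;            // each Newton step doubles the number of valid bits
  }
  return 0 - x;                 // negate so that inv * n0 == -1 (mod 2^64)
}
#endif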

void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}

#if INCLUDE_JFR

// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is the jobject handle result; unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}

// For c2: call to return a leased buffer.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}

#endif // INCLUDE_JFR