1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #endif 29 #include "asm/macroAssembler.hpp" 30 #include "asm/macroAssembler.inline.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/nativeInst.hpp" 34 #include "code/vtableStubs.hpp" 35 #include "compiler/oopMap.hpp" 36 #include "gc/shared/collectedHeap.hpp" 37 #include "gc/shared/gcLocker.hpp" 38 #include "gc/shared/barrierSet.hpp" 39 #include "gc/shared/barrierSetAssembler.hpp" 40 #include "interpreter/interpreter.hpp" 41 #include "logging/log.hpp" 42 #include "memory/resourceArea.hpp" 43 #include "memory/universe.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "oops/method.inline.hpp" 46 #include "prims/methodHandles.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/globals.hpp" 50 #include "runtime/jniHandles.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/timerTrace.hpp" 56 #include "runtime/vframeArray.hpp" 57 #include "runtime/vm_version.hpp" 58 #include "utilities/align.hpp" 59 #include "utilities/checkedCast.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif // PRODUCT 79 80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 81 82 class RegisterSaver { 83 // Capture info about frame layout. Layout offsets are in jint 84 // units because compiler frame slots are jints. 
85 #define XSAVE_AREA_BEGIN 160 86 #define XSAVE_AREA_YMM_BEGIN 576 87 #define XSAVE_AREA_EGPRS 960 88 #define XSAVE_AREA_OPMASK_BEGIN 1088 89 #define XSAVE_AREA_ZMM_BEGIN 1152 90 #define XSAVE_AREA_UPPERBANK 1664 91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 96 enum layout { 97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 99 DEF_XMM_OFFS(0), 100 DEF_XMM_OFFS(1), 101 // 2..15 are implied in range usage 102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 103 DEF_YMM_OFFS(0), 104 DEF_YMM_OFFS(1), 105 // 2..15 are implied in range usage 106 r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 107 r31H_off, 108 r30_off, r30H_off, 109 r29_off, r29H_off, 110 r28_off, r28H_off, 111 r27_off, r27H_off, 112 r26_off, r26H_off, 113 r25_off, r25H_off, 114 r24_off, r24H_off, 115 r23_off, r23H_off, 116 r22_off, r22H_off, 117 r21_off, r21H_off, 118 r20_off, r20H_off, 119 r19_off, r19H_off, 120 r18_off, r18H_off, 121 r17_off, r17H_off, 122 r16_off, r16H_off, 123 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 124 DEF_OPMASK_OFFS(0), 125 DEF_OPMASK_OFFS(1), 126 // 2..7 are implied in range usage 127 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 128 DEF_ZMM_OFFS(0), 129 DEF_ZMM_OFFS(1), 130 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 131 DEF_ZMM_UPPER_OFFS(16), 132 DEF_ZMM_UPPER_OFFS(17), 133 // 18..31 are implied in range usage 134 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 135 fpu_stateH_end, 136 r15_off, r15H_off, 137 r14_off, r14H_off, 138 r13_off, r13H_off, 139 r12_off, r12H_off, 140 r11_off, r11H_off, 141 r10_off, r10H_off, 142 r9_off, r9H_off, 143 r8_off, r8H_off, 144 rdi_off, rdiH_off, 145 rsi_off, rsiH_off, 146 ignore_off, ignoreH_off, // extra copy of rbp 147 rsp_off, rspH_off, 148 rbx_off, rbxH_off, 149 rdx_off, rdxH_off, 150 rcx_off, rcxH_off, 151 rax_off, raxH_off, 152 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 153 align_off, alignH_off, 154 flags_off, flagsH_off, 155 // The frame sender code expects that rbp will be in the "natural" place and 156 // will override any oopMap setting for it. We must therefore force the layout 157 // so that it agrees with the frame sender code. 
158 rbp_off, rbpH_off, // copy of rbp we will restore 159 return_off, returnH_off, // slot for return address 160 reg_save_size // size in compiler stack slots 161 }; 162 163 public: 164 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors); 165 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false); 166 167 // Offsets into the register save area 168 // Used by deoptimization when it is managing result register 169 // values on its own 170 171 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; } 172 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; } 173 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; } 174 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; } 175 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; } 176 177 // During deoptimization only the result registers need to be restored, 178 // all the other values have already been extracted. 179 static void restore_result_registers(MacroAssembler* masm); 180 }; 181 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) { 183 int off = 0; 184 int num_xmm_regs = XMMRegister::available_xmm_registers(); 185 #if COMPILER2_OR_JVMCI 186 if (save_wide_vectors && UseAVX == 0) { 187 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX 188 } 189 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 190 #else 191 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI 192 #endif 193 194 // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated 195 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs); 196 // OopMap frame size is in compiler stack slots (jint's) not bytes or words 197 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 198 // CodeBlob frame size is in words. 199 int frame_size_in_words = frame_size_in_bytes / wordSize; 200 *total_frame_words = frame_size_in_words; 201 202 // Save registers, fpu state, and flags. 203 // We assume caller has already pushed the return address onto the 204 // stack, so rsp is 8-byte aligned here. 205 // We push rpb twice in this sequence because we want the real rbp 206 // to be under the return like a normal enter. 
207 208 __ enter(); // rsp becomes 16-byte aligned here 209 __ pushf(); 210 // Make sure rsp stays 16-byte aligned 211 __ subq(rsp, 8); 212 // Push CPU state in multiple of 16 bytes 213 __ save_legacy_gprs(); 214 __ push_FPU_state(); 215 216 217 // push cpu state handles this on EVEX enabled targets 218 if (save_wide_vectors) { 219 // Save upper half of YMM registers(0..15) 220 int base_addr = XSAVE_AREA_YMM_BEGIN; 221 for (int n = 0; n < 16; n++) { 222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 223 } 224 if (VM_Version::supports_evex()) { 225 // Save upper half of ZMM registers(0..15) 226 base_addr = XSAVE_AREA_ZMM_BEGIN; 227 for (int n = 0; n < 16; n++) { 228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 229 } 230 // Save full ZMM registers(16..num_xmm_regs) 231 base_addr = XSAVE_AREA_UPPERBANK; 232 off = 0; 233 int vector_len = Assembler::AVX_512bit; 234 for (int n = 16; n < num_xmm_regs; n++) { 235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 236 } 237 #if COMPILER2_OR_JVMCI 238 base_addr = XSAVE_AREA_OPMASK_BEGIN; 239 off = 0; 240 for(int n = 0; n < KRegister::number_of_registers; n++) { 241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 242 } 243 #endif 244 } 245 } else { 246 if (VM_Version::supports_evex()) { 247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 248 int base_addr = XSAVE_AREA_UPPERBANK; 249 off = 0; 250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 251 for (int n = 16; n < num_xmm_regs; n++) { 252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 253 } 254 #if COMPILER2_OR_JVMCI 255 base_addr = XSAVE_AREA_OPMASK_BEGIN; 256 off = 0; 257 for(int n = 0; n < KRegister::number_of_registers; n++) { 258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 259 } 260 #endif 261 } 262 } 263 264 #if COMPILER2_OR_JVMCI 265 if (UseAPX) { 266 int base_addr = XSAVE_AREA_EGPRS; 267 off = 0; 268 for(int n = 16; n < Register::number_of_registers; n++) { 269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 270 } 271 } 272 #endif 273 274 __ vzeroupper(); 275 if (frame::arg_reg_save_area_bytes != 0) { 276 // Allocate argument register save area 277 __ subptr(rsp, frame::arg_reg_save_area_bytes); 278 } 279 280 // Set an oopmap for the call site. This oopmap will map all 281 // oop-registers and debug-info registers as callee-saved. This 282 // will allow deoptimization at this safepoint to find all possible 283 // debug-info recordings, as well as let GC find all oops. 
284 285 OopMapSet *oop_maps = new OopMapSet(); 286 OopMap* map = new OopMap(frame_size_in_slots, 0); 287 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 289 290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 294 // rbp location is known implicitly by the frame sender code, needs no oopmap 295 // and the location where rbp was saved by is ignored 296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 306 307 if (UseAPX) { 308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 324 } 325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 326 // on EVEX enabled targets, we get it included in the xsave area 327 off = xmm0_off; 328 int delta = xmm1_off - off; 329 for (int n = 0; n < 16; n++) { 330 XMMRegister xmm_name = as_XMMRegister(n); 331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 332 off += delta; 333 } 334 if (UseAVX > 2) { 335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 336 off = zmm16_off; 337 delta = zmm17_off - off; 338 for (int n = 16; n < num_xmm_regs; n++) { 339 XMMRegister zmm_name = as_XMMRegister(n); 340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 341 off += delta; 342 } 343 } 344 345 #if COMPILER2_OR_JVMCI 346 if (save_wide_vectors) { 347 // Save upper half of YMM registers(0..15) 348 off = ymm0_off; 349 delta = ymm1_off - ymm0_off; 350 for (int n = 0; n < 16; n++) { 351 XMMRegister ymm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 353 off += delta; 354 } 355 if (VM_Version::supports_evex()) { 356 // Save upper half of ZMM registers(0..15) 357 off = zmm0_off; 
358 delta = zmm1_off - zmm0_off; 359 for (int n = 0; n < 16; n++) { 360 XMMRegister zmm_name = as_XMMRegister(n); 361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 362 off += delta; 363 } 364 } 365 } 366 #endif // COMPILER2_OR_JVMCI 367 368 // %%% These should all be a waste but we'll keep things as they were for now 369 if (true) { 370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 374 // rbp location is known implicitly by the frame sender code, needs no oopmap 375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 385 if (UseAPX) { 386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 402 } 403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 404 // on EVEX enabled targets, we get it included in the xsave area 405 off = xmm0H_off; 406 delta = xmm1H_off - off; 407 for (int n = 0; n < 16; n++) { 408 XMMRegister xmm_name = as_XMMRegister(n); 409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 410 off += delta; 411 } 412 if (UseAVX > 2) { 413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 414 off = zmm16H_off; 415 delta = zmm17H_off - off; 416 for (int n = 16; n < num_xmm_regs; n++) { 417 XMMRegister zmm_name = as_XMMRegister(n); 418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 419 off += delta; 420 } 421 } 422 } 423 424 return map; 
425 } 426 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 428 int num_xmm_regs = XMMRegister::available_xmm_registers(); 429 if (frame::arg_reg_save_area_bytes != 0) { 430 // Pop arg register save area 431 __ addptr(rsp, frame::arg_reg_save_area_bytes); 432 } 433 434 #if COMPILER2_OR_JVMCI 435 if (restore_wide_vectors) { 436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 438 } 439 #else 440 assert(!restore_wide_vectors, "vectors are generated only by C2"); 441 #endif 442 443 __ vzeroupper(); 444 445 // On EVEX enabled targets everything is handled in pop fpu state 446 if (restore_wide_vectors) { 447 // Restore upper half of YMM registers (0..15) 448 int base_addr = XSAVE_AREA_YMM_BEGIN; 449 for (int n = 0; n < 16; n++) { 450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 451 } 452 if (VM_Version::supports_evex()) { 453 // Restore upper half of ZMM registers (0..15) 454 base_addr = XSAVE_AREA_ZMM_BEGIN; 455 for (int n = 0; n < 16; n++) { 456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 457 } 458 // Restore full ZMM registers(16..num_xmm_regs) 459 base_addr = XSAVE_AREA_UPPERBANK; 460 int vector_len = Assembler::AVX_512bit; 461 int off = 0; 462 for (int n = 16; n < num_xmm_regs; n++) { 463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 464 } 465 #if COMPILER2_OR_JVMCI 466 base_addr = XSAVE_AREA_OPMASK_BEGIN; 467 off = 0; 468 for (int n = 0; n < KRegister::number_of_registers; n++) { 469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 470 } 471 #endif 472 } 473 } else { 474 if (VM_Version::supports_evex()) { 475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 476 int base_addr = XSAVE_AREA_UPPERBANK; 477 int off = 0; 478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 479 for (int n = 16; n < num_xmm_regs; n++) { 480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 481 } 482 #if COMPILER2_OR_JVMCI 483 base_addr = XSAVE_AREA_OPMASK_BEGIN; 484 off = 0; 485 for (int n = 0; n < KRegister::number_of_registers; n++) { 486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 487 } 488 #endif 489 } 490 } 491 492 #if COMPILER2_OR_JVMCI 493 if (UseAPX) { 494 int base_addr = XSAVE_AREA_EGPRS; 495 int off = 0; 496 for (int n = 16; n < Register::number_of_registers; n++) { 497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 498 } 499 } 500 #endif 501 502 // Recover CPU state 503 __ pop_FPU_state(); 504 __ restore_legacy_gprs(); 505 __ addq(rsp, 8); 506 __ popf(); 507 // Get the rbp described implicitly by the calling convention (no oopMap) 508 __ pop(rbp); 509 } 510 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 512 513 // Just restore result register. Only used by deoptimization. By 514 // now any callee save register that needs to be restored to a c2 515 // caller of the deoptee has been extracted into the vframeArray 516 // and will be stuffed into the c2i adapter we create for later 517 // restoration so only result registers need to be restored here. 
518 519 // Restore fp result register 520 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes())); 521 // Restore integer result register 522 __ movptr(rax, Address(rsp, rax_offset_in_bytes())); 523 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes())); 524 525 // Pop all of the register save are off the stack except the return address 526 __ addptr(rsp, return_offset_in_bytes()); 527 } 528 529 // Is vector's size (in bytes) bigger than a size saved by default? 530 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions. 531 bool SharedRuntime::is_wide_vector(int size) { 532 return size > 16; 533 } 534 535 // --------------------------------------------------------------------------- 536 // Read the array of BasicTypes from a signature, and compute where the 537 // arguments should go. Values in the VMRegPair regs array refer to 4-byte 538 // quantities. Values less than VMRegImpl::stack0 are registers, those above 539 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer 540 // as framesizes are fixed. 541 // VMRegImpl::stack0 refers to the first slot 0(sp). 542 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher. 543 // Register up to Register::number_of_registers are the 64-bit 544 // integer registers. 545 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are 547 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit 548 // units regardless of build. Of course for i486 there is no 64 bit build 549 550 // The Java calling convention is a "shifted" version of the C ABI. 551 // By skipping the first C ABI register we can call non-static jni methods 552 // with small numbers of arguments without having to shuffle the arguments 553 // at all. Since we control the java ABI we ought to at least get some 554 // advantage out of it. 555 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt, 557 VMRegPair *regs, 558 int total_args_passed) { 559 560 // Create the mapping between argument positions and 561 // registers. 
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 564 }; 565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 566 j_farg0, j_farg1, j_farg2, j_farg3, 567 j_farg4, j_farg5, j_farg6, j_farg7 568 }; 569 570 571 uint int_args = 0; 572 uint fp_args = 0; 573 uint stk_args = 0; 574 575 for (int i = 0; i < total_args_passed; i++) { 576 switch (sig_bt[i]) { 577 case T_BOOLEAN: 578 case T_CHAR: 579 case T_BYTE: 580 case T_SHORT: 581 case T_INT: 582 if (int_args < Argument::n_int_register_parameters_j) { 583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 584 } else { 585 stk_args = align_up(stk_args, 2); 586 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 587 stk_args += 1; 588 } 589 break; 590 case T_VOID: 591 // halves of T_LONG or T_DOUBLE 592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 593 regs[i].set_bad(); 594 break; 595 case T_LONG: 596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 597 // fall through 598 case T_OBJECT: 599 case T_ARRAY: 600 case T_ADDRESS: 601 if (int_args < Argument::n_int_register_parameters_j) { 602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 603 } else { 604 stk_args = align_up(stk_args, 2); 605 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 606 stk_args += 2; 607 } 608 break; 609 case T_FLOAT: 610 if (fp_args < Argument::n_float_register_parameters_j) { 611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 612 } else { 613 stk_args = align_up(stk_args, 2); 614 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 615 stk_args += 1; 616 } 617 break; 618 case T_DOUBLE: 619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 620 if (fp_args < Argument::n_float_register_parameters_j) { 621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 622 } else { 623 stk_args = align_up(stk_args, 2); 624 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 625 stk_args += 2; 626 } 627 break; 628 default: 629 ShouldNotReachHere(); 630 break; 631 } 632 } 633 634 return stk_args; 635 } 636 637 // Patch the callers callsite with entry to compiled code if it exists. 638 static void patch_callers_callsite(MacroAssembler *masm) { 639 Label L; 640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 641 __ jcc(Assembler::equal, L); 642 643 // Save the current stack pointer 644 __ mov(r13, rsp); 645 // Schedule the branch target address early. 
646 // Call into the VM to patch the caller, then jump to compiled callee 647 // rax isn't live so capture return address while we easily can 648 __ movptr(rax, Address(rsp, 0)); 649 650 // align stack so push_CPU_state doesn't fault 651 __ andptr(rsp, -(StackAlignmentInBytes)); 652 __ push_CPU_state(); 653 __ vzeroupper(); 654 // VM needs caller's callsite 655 // VM needs target method 656 // This needs to be a long call since we will relocate this adapter to 657 // the codeBuffer and it may not reach 658 659 // Allocate argument register save area 660 if (frame::arg_reg_save_area_bytes != 0) { 661 __ subptr(rsp, frame::arg_reg_save_area_bytes); 662 } 663 __ mov(c_rarg0, rbx); 664 __ mov(c_rarg1, rax); 665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 666 667 // De-allocate argument register save area 668 if (frame::arg_reg_save_area_bytes != 0) { 669 __ addptr(rsp, frame::arg_reg_save_area_bytes); 670 } 671 672 __ vzeroupper(); 673 __ pop_CPU_state(); 674 // restore sp 675 __ mov(rsp, r13); 676 __ bind(L); 677 } 678 679 680 static void gen_c2i_adapter(MacroAssembler *masm, 681 int total_args_passed, 682 int comp_args_on_stack, 683 const BasicType *sig_bt, 684 const VMRegPair *regs, 685 Label& skip_fixup) { 686 // Before we get into the guts of the C2I adapter, see if we should be here 687 // at all. We've come from compiled code and are attempting to jump to the 688 // interpreter, which means the caller made a static call to get here 689 // (vcalls always get a compiled target if there is one). Check for a 690 // compiled target. If there is one, we need to patch the caller's call. 691 patch_callers_callsite(masm); 692 693 __ bind(skip_fixup); 694 695 // Since all args are passed on the stack, total_args_passed * 696 // Interpreter::stackElementSize is the space we need. 697 698 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 699 700 int extraspace = (total_args_passed * Interpreter::stackElementSize); 701 702 // stack is aligned, keep it that way 703 // This is not currently needed or enforced by the interpreter, but 704 // we might as well conform to the ABI. 705 extraspace = align_up(extraspace, 2*wordSize); 706 707 // set senderSP value 708 __ lea(r13, Address(rsp, wordSize)); 709 710 #ifdef ASSERT 711 __ check_stack_alignment(r13, "sender stack not aligned"); 712 #endif 713 if (extraspace > 0) { 714 // Pop the return address 715 __ pop(rax); 716 717 __ subptr(rsp, extraspace); 718 719 // Push the return address 720 __ push(rax); 721 722 // Account for the return address location since we store it first rather 723 // than hold it in a register across all the shuffling 724 extraspace += wordSize; 725 } 726 727 #ifdef ASSERT 728 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 729 #endif 730 731 // Now write the args into the outgoing interpreter space 732 for (int i = 0; i < total_args_passed; i++) { 733 if (sig_bt[i] == T_VOID) { 734 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 735 continue; 736 } 737 738 // offset to start parameters 739 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 740 int next_off = st_off - Interpreter::stackElementSize; 741 742 // Say 4 args: 743 // i st_off 744 // 0 32 T_LONG 745 // 1 24 T_VOID 746 // 2 16 T_OBJECT 747 // 3 8 T_BOOL 748 // - 0 return address 749 // 750 // However to make thing extra confusing. 
Because we can fit a long/double in 751 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 752 // leaves one slot empty and only stores to a single slot. In this case the 753 // slot that is occupied is the T_VOID slot. See I said it was confusing. 754 755 VMReg r_1 = regs[i].first(); 756 VMReg r_2 = regs[i].second(); 757 if (!r_1->is_valid()) { 758 assert(!r_2->is_valid(), ""); 759 continue; 760 } 761 if (r_1->is_stack()) { 762 // memory to memory use rax 763 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 764 if (!r_2->is_valid()) { 765 // sign extend?? 766 __ movl(rax, Address(rsp, ld_off)); 767 __ movptr(Address(rsp, st_off), rax); 768 769 } else { 770 771 __ movq(rax, Address(rsp, ld_off)); 772 773 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 774 // T_DOUBLE and T_LONG use two slots in the interpreter 775 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 776 // ld_off == LSW, ld_off+wordSize == MSW 777 // st_off == MSW, next_off == LSW 778 __ movq(Address(rsp, next_off), rax); 779 #ifdef ASSERT 780 // Overwrite the unused slot with known junk 781 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 782 __ movptr(Address(rsp, st_off), rax); 783 #endif /* ASSERT */ 784 } else { 785 __ movq(Address(rsp, st_off), rax); 786 } 787 } 788 } else if (r_1->is_Register()) { 789 Register r = r_1->as_Register(); 790 if (!r_2->is_valid()) { 791 // must be only an int (or less ) so move only 32bits to slot 792 // why not sign extend?? 793 __ movl(Address(rsp, st_off), r); 794 } else { 795 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 796 // T_DOUBLE and T_LONG use two slots in the interpreter 797 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 798 // long/double in gpr 799 #ifdef ASSERT 800 // Overwrite the unused slot with known junk 801 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 802 __ movptr(Address(rsp, st_off), rax); 803 #endif /* ASSERT */ 804 __ movq(Address(rsp, next_off), r); 805 } else { 806 __ movptr(Address(rsp, st_off), r); 807 } 808 } 809 } else { 810 assert(r_1->is_XMMRegister(), ""); 811 if (!r_2->is_valid()) { 812 // only a float use just part of the slot 813 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 814 } else { 815 #ifdef ASSERT 816 // Overwrite the unused slot with known junk 817 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 818 __ movptr(Address(rsp, st_off), rax); 819 #endif /* ASSERT */ 820 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 821 } 822 } 823 } 824 825 // Schedule the branch target address early. 826 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 827 __ jmp(rcx); 828 } 829 830 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, 831 address code_start, address code_end, 832 Label& L_ok) { 833 Label L_fail; 834 __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none)); 835 __ cmpptr(pc_reg, temp_reg); 836 __ jcc(Assembler::belowEqual, L_fail); 837 __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none)); 838 __ cmpptr(pc_reg, temp_reg); 839 __ jcc(Assembler::below, L_ok); 840 __ bind(L_fail); 841 } 842 843 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 844 int total_args_passed, 845 int comp_args_on_stack, 846 const BasicType *sig_bt, 847 const VMRegPair *regs) { 848 849 // Note: r13 contains the senderSP on entry. We must preserve it since 850 // we may do a i2c -> c2i transition if we lose a race where compiled 851 // code goes non-entrant while we get args ready. 
852 // In addition we use r13 to locate all the interpreter args as 853 // we must align the stack to 16 bytes on an i2c entry else we 854 // lose alignment we expect in all compiled code and register 855 // save code can segv when fxsave instructions find improperly 856 // aligned stack pointer. 857 858 // Adapters can be frameless because they do not require the caller 859 // to perform additional cleanup work, such as correcting the stack pointer. 860 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 861 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 862 // even if a callee has modified the stack pointer. 863 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 864 // routinely repairs its caller's stack pointer (from sender_sp, which is set 865 // up via the senderSP register). 866 // In other words, if *either* the caller or callee is interpreted, we can 867 // get the stack pointer repaired after a call. 868 // This is why c2i and i2c adapters cannot be indefinitely composed. 869 // In particular, if a c2i adapter were to somehow call an i2c adapter, 870 // both caller and callee would be compiled methods, and neither would 871 // clean up the stack pointer changes performed by the two adapters. 872 // If this happens, control eventually transfers back to the compiled 873 // caller, but with an uncorrected stack, causing delayed havoc. 874 875 if (VerifyAdapterCalls && 876 (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) { 877 // So, let's test for cascading c2i/i2c adapters right now. 878 // assert(Interpreter::contains($return_addr) || 879 // StubRoutines::contains($return_addr), 880 // "i2c adapter must return to an interpreter frame"); 881 __ block_comment("verify_i2c { "); 882 // Pick up the return address 883 __ movptr(rax, Address(rsp, 0)); 884 Label L_ok; 885 if (Interpreter::code() != nullptr) { 886 range_check(masm, rax, r11, 887 Interpreter::code()->code_start(), 888 Interpreter::code()->code_end(), 889 L_ok); 890 } 891 if (StubRoutines::initial_stubs_code() != nullptr) { 892 range_check(masm, rax, r11, 893 StubRoutines::initial_stubs_code()->code_begin(), 894 StubRoutines::initial_stubs_code()->code_end(), 895 L_ok); 896 } 897 if (StubRoutines::final_stubs_code() != nullptr) { 898 range_check(masm, rax, r11, 899 StubRoutines::final_stubs_code()->code_begin(), 900 StubRoutines::final_stubs_code()->code_end(), 901 L_ok); 902 } 903 const char* msg = "i2c adapter must return to an interpreter frame"; 904 __ block_comment(msg); 905 __ stop(msg); 906 __ bind(L_ok); 907 __ block_comment("} verify_i2ce "); 908 } 909 910 // Must preserve original SP for loading incoming arguments because 911 // we need to align the outgoing SP for compiled code. 912 __ movptr(r11, rsp); 913 914 // Pick up the return address 915 __ pop(rax); 916 917 // Convert 4-byte c2 stack slots to words. 
918 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 919 920 if (comp_args_on_stack) { 921 __ subptr(rsp, comp_words_on_stack * wordSize); 922 } 923 924 // Ensure compiled code always sees stack at proper alignment 925 __ andptr(rsp, -16); 926 927 // push the return address and misalign the stack that youngest frame always sees 928 // as far as the placement of the call instruction 929 __ push(rax); 930 931 // Put saved SP in another register 932 const Register saved_sp = rax; 933 __ movptr(saved_sp, r11); 934 935 // Will jump to the compiled code just as if compiled code was doing it. 936 // Pre-load the register-jump target early, to schedule it better. 937 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 938 939 #if INCLUDE_JVMCI 940 if (EnableJVMCI) { 941 // check if this call should be routed towards a specific entry point 942 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 943 Label no_alternative_target; 944 __ jcc(Assembler::equal, no_alternative_target); 945 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 946 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 947 __ bind(no_alternative_target); 948 } 949 #endif // INCLUDE_JVMCI 950 951 // Now generate the shuffle code. Pick up all register args and move the 952 // rest through the floating point stack top. 953 for (int i = 0; i < total_args_passed; i++) { 954 if (sig_bt[i] == T_VOID) { 955 // Longs and doubles are passed in native word order, but misaligned 956 // in the 32-bit build. 957 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 958 continue; 959 } 960 961 // Pick up 0, 1 or 2 words from SP+offset. 962 963 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 964 "scrambled load targets?"); 965 // Load in argument order going down. 966 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 967 // Point to interpreter value (vs. tag) 968 int next_off = ld_off - Interpreter::stackElementSize; 969 // 970 // 971 // 972 VMReg r_1 = regs[i].first(); 973 VMReg r_2 = regs[i].second(); 974 if (!r_1->is_valid()) { 975 assert(!r_2->is_valid(), ""); 976 continue; 977 } 978 if (r_1->is_stack()) { 979 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 980 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 981 982 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 983 // and if we end up going thru a c2i because of a miss a reasonable value of r13 984 // will be generated. 985 if (!r_2->is_valid()) { 986 // sign extend??? 987 __ movl(r13, Address(saved_sp, ld_off)); 988 __ movptr(Address(rsp, st_off), r13); 989 } else { 990 // 991 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 992 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 993 // So we must adjust where to pick up the data to match the interpreter. 994 // 995 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 996 // are accessed as negative so LSW is at LOW address 997 998 // ld_off is MSW so get LSW 999 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1000 next_off : ld_off; 1001 __ movq(r13, Address(saved_sp, offset)); 1002 // st_off is LSW (i.e. 
reg.first()) 1003 __ movq(Address(rsp, st_off), r13); 1004 } 1005 } else if (r_1->is_Register()) { // Register argument 1006 Register r = r_1->as_Register(); 1007 assert(r != rax, "must be different"); 1008 if (r_2->is_valid()) { 1009 // 1010 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1011 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1012 // So we must adjust where to pick up the data to match the interpreter. 1013 1014 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 1015 next_off : ld_off; 1016 1017 // this can be a misaligned move 1018 __ movq(r, Address(saved_sp, offset)); 1019 } else { 1020 // sign extend and use a full word? 1021 __ movl(r, Address(saved_sp, ld_off)); 1022 } 1023 } else { 1024 if (!r_2->is_valid()) { 1025 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 1026 } else { 1027 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 1028 } 1029 } 1030 } 1031 1032 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 1033 1034 // 6243940 We might end up in handle_wrong_method if 1035 // the callee is deoptimized as we race thru here. If that 1036 // happens we don't want to take a safepoint because the 1037 // caller frame will look interpreted and arguments are now 1038 // "compiled" so it is much better to make this transition 1039 // invisible to the stack walking code. Unfortunately if 1040 // we try and find the callee by normal means a safepoint 1041 // is possible. So we stash the desired callee in the thread 1042 // and the vm will find there should this case occur. 1043 1044 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 1045 1046 // put Method* where a c2i would expect should we end up there 1047 // only needed because eof c2 resolve stubs return Method* as a result in 1048 // rax 1049 __ mov(rax, rbx); 1050 __ jmp(r11); 1051 } 1052 1053 // --------------------------------------------------------------- 1054 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 1055 int total_args_passed, 1056 int comp_args_on_stack, 1057 const BasicType *sig_bt, 1058 const VMRegPair *regs, 1059 AdapterFingerPrint* fingerprint) { 1060 address i2c_entry = __ pc(); 1061 1062 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 1063 1064 // ------------------------------------------------------------------------- 1065 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1066 // to the interpreter. The args start out packed in the compiled layout. They 1067 // need to be unpacked into the interpreter layout. This will almost always 1068 // require some stack space. We grow the current (compiled) stack, then repack 1069 // the args. We finally end in a jump to the generic interpreter entry point. 1070 // On exit from the interpreter, the interpreter will restore our SP (lest the 1071 // compiled code, which relies solely on SP and not RBP, get sick). 
1072 1073 address c2i_unverified_entry = __ pc(); 1074 Label skip_fixup; 1075 1076 Register data = rax; 1077 Register receiver = j_rarg0; 1078 Register temp = rbx; 1079 1080 { 1081 __ ic_check(1 /* end_alignment */); 1082 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1083 // Method might have been compiled since the call site was patched to 1084 // interpreted if that is the case treat it as a miss so we can get 1085 // the call site corrected. 1086 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1087 __ jcc(Assembler::equal, skip_fixup); 1088 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1089 } 1090 1091 address c2i_entry = __ pc(); 1092 1093 // Class initialization barrier for static methods 1094 address c2i_no_clinit_check_entry = nullptr; 1095 if (VM_Version::supports_fast_class_init_checks()) { 1096 Label L_skip_barrier; 1097 Register method = rbx; 1098 1099 { // Bypass the barrier for non-static methods 1100 Register flags = rscratch1; 1101 __ movl(flags, Address(method, Method::access_flags_offset())); 1102 __ testl(flags, JVM_ACC_STATIC); 1103 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1104 } 1105 1106 Register klass = rscratch1; 1107 __ load_method_holder(klass, method); 1108 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 1109 1110 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1111 1112 __ bind(L_skip_barrier); 1113 c2i_no_clinit_check_entry = __ pc(); 1114 } 1115 1116 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1117 bs->c2i_entry_barrier(masm); 1118 1119 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1120 1121 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1122 } 1123 1124 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1125 VMRegPair *regs, 1126 int total_args_passed) { 1127 1128 // We return the amount of VMRegImpl stack slots we need to reserve for all 1129 // the arguments NOT counting out_preserve_stack_slots. 1130 1131 // NOTE: These arrays will have to change when c1 is ported 1132 #ifdef _WIN64 1133 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1134 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1135 }; 1136 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1137 c_farg0, c_farg1, c_farg2, c_farg3 1138 }; 1139 #else 1140 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1141 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1142 }; 1143 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1144 c_farg0, c_farg1, c_farg2, c_farg3, 1145 c_farg4, c_farg5, c_farg6, c_farg7 1146 }; 1147 #endif // _WIN64 1148 1149 1150 uint int_args = 0; 1151 uint fp_args = 0; 1152 uint stk_args = 0; // inc by 2 each time 1153 1154 for (int i = 0; i < total_args_passed; i++) { 1155 switch (sig_bt[i]) { 1156 case T_BOOLEAN: 1157 case T_CHAR: 1158 case T_BYTE: 1159 case T_SHORT: 1160 case T_INT: 1161 if (int_args < Argument::n_int_register_parameters_c) { 1162 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1163 #ifdef _WIN64 1164 fp_args++; 1165 // Allocate slots for callee to stuff register args the stack. 
1166 stk_args += 2; 1167 #endif 1168 } else { 1169 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1170 stk_args += 2; 1171 } 1172 break; 1173 case T_LONG: 1174 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1175 // fall through 1176 case T_OBJECT: 1177 case T_ARRAY: 1178 case T_ADDRESS: 1179 case T_METADATA: 1180 if (int_args < Argument::n_int_register_parameters_c) { 1181 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1182 #ifdef _WIN64 1183 fp_args++; 1184 stk_args += 2; 1185 #endif 1186 } else { 1187 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1188 stk_args += 2; 1189 } 1190 break; 1191 case T_FLOAT: 1192 if (fp_args < Argument::n_float_register_parameters_c) { 1193 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1194 #ifdef _WIN64 1195 int_args++; 1196 // Allocate slots for callee to stuff register args the stack. 1197 stk_args += 2; 1198 #endif 1199 } else { 1200 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1201 stk_args += 2; 1202 } 1203 break; 1204 case T_DOUBLE: 1205 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1206 if (fp_args < Argument::n_float_register_parameters_c) { 1207 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1208 #ifdef _WIN64 1209 int_args++; 1210 // Allocate slots for callee to stuff register args the stack. 1211 stk_args += 2; 1212 #endif 1213 } else { 1214 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1215 stk_args += 2; 1216 } 1217 break; 1218 case T_VOID: // Halves of longs and doubles 1219 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1220 regs[i].set_bad(); 1221 break; 1222 default: 1223 ShouldNotReachHere(); 1224 break; 1225 } 1226 } 1227 #ifdef _WIN64 1228 // windows abi requires that we always allocate enough stack space 1229 // for 4 64bit registers to be stored down. 1230 if (stk_args < 8) { 1231 stk_args = 8; 1232 } 1233 #endif // _WIN64 1234 1235 return stk_args; 1236 } 1237 1238 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1239 uint num_bits, 1240 uint total_args_passed) { 1241 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1242 "only certain vector sizes are supported for now"); 1243 1244 static const XMMRegister VEC_ArgReg[32] = { 1245 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1246 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1247 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1248 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1249 }; 1250 1251 uint stk_args = 0; 1252 uint fp_args = 0; 1253 1254 for (uint i = 0; i < total_args_passed; i++) { 1255 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1256 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 
7 : 15)); 1257 regs[i].set_pair(vmreg->next(next_val), vmreg); 1258 } 1259 1260 return stk_args; 1261 } 1262 1263 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1264 // We always ignore the frame_slots arg and just use the space just below frame pointer 1265 // which by this time is free to use 1266 switch (ret_type) { 1267 case T_FLOAT: 1268 __ movflt(Address(rbp, -wordSize), xmm0); 1269 break; 1270 case T_DOUBLE: 1271 __ movdbl(Address(rbp, -wordSize), xmm0); 1272 break; 1273 case T_VOID: break; 1274 default: { 1275 __ movptr(Address(rbp, -wordSize), rax); 1276 } 1277 } 1278 } 1279 1280 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1281 // We always ignore the frame_slots arg and just use the space just below frame pointer 1282 // which by this time is free to use 1283 switch (ret_type) { 1284 case T_FLOAT: 1285 __ movflt(xmm0, Address(rbp, -wordSize)); 1286 break; 1287 case T_DOUBLE: 1288 __ movdbl(xmm0, Address(rbp, -wordSize)); 1289 break; 1290 case T_VOID: break; 1291 default: { 1292 __ movptr(rax, Address(rbp, -wordSize)); 1293 } 1294 } 1295 } 1296 1297 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1298 for ( int i = first_arg ; i < arg_count ; i++ ) { 1299 if (args[i].first()->is_Register()) { 1300 __ push(args[i].first()->as_Register()); 1301 } else if (args[i].first()->is_XMMRegister()) { 1302 __ subptr(rsp, 2*wordSize); 1303 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1304 } 1305 } 1306 } 1307 1308 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1309 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1310 if (args[i].first()->is_Register()) { 1311 __ pop(args[i].first()->as_Register()); 1312 } else if (args[i].first()->is_XMMRegister()) { 1313 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1314 __ addptr(rsp, 2*wordSize); 1315 } 1316 } 1317 } 1318 1319 static void verify_oop_args(MacroAssembler* masm, 1320 const methodHandle& method, 1321 const BasicType* sig_bt, 1322 const VMRegPair* regs) { 1323 Register temp_reg = rbx; // not part of any compiled calling seq 1324 if (VerifyOops) { 1325 for (int i = 0; i < method->size_of_parameters(); i++) { 1326 if (is_reference_type(sig_bt[i])) { 1327 VMReg r = regs[i].first(); 1328 assert(r->is_valid(), "bad oop arg"); 1329 if (r->is_stack()) { 1330 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1331 __ verify_oop(temp_reg); 1332 } else { 1333 __ verify_oop(r->as_Register()); 1334 } 1335 } 1336 } 1337 } 1338 } 1339 1340 static void check_continuation_enter_argument(VMReg actual_vmreg, 1341 Register expected_reg, 1342 const char* name) { 1343 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1344 assert(actual_vmreg->as_Register() == expected_reg, 1345 "%s is in unexpected register: %s instead of %s", 1346 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1347 } 1348 1349 1350 //---------------------------- continuation_enter_setup --------------------------- 1351 // 1352 // Arguments: 1353 // None. 
1354 // 1355 // Results: 1356 // rsp: pointer to blank ContinuationEntry 1357 // 1358 // Kills: 1359 // rax 1360 // 1361 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1362 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1363 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1364 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1365 1366 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1367 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1368 1369 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1370 OopMap* map = new OopMap(frame_size, 0); 1371 1372 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1373 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1374 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1375 1376 return map; 1377 } 1378 1379 //---------------------------- fill_continuation_entry --------------------------- 1380 // 1381 // Arguments: 1382 // rsp: pointer to blank Continuation entry 1383 // reg_cont_obj: pointer to the continuation 1384 // reg_flags: flags 1385 // 1386 // Results: 1387 // rsp: pointer to filled out ContinuationEntry 1388 // 1389 // Kills: 1390 // rax 1391 // 1392 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1393 assert_different_registers(rax, reg_cont_obj, reg_flags); 1394 #ifdef ASSERT 1395 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1396 #endif 1397 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1398 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1399 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1400 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1401 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1402 1403 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1404 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1405 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1406 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1407 1408 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1409 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1410 } 1411 1412 //---------------------------- continuation_enter_cleanup --------------------------- 1413 // 1414 // Arguments: 1415 // rsp: pointer to the ContinuationEntry 1416 // 1417 // Results: 1418 // rsp: pointer to the spilled rbp in the entry frame 1419 // 1420 // Kills: 1421 // rbx 1422 // 1423 void static continuation_enter_cleanup(MacroAssembler* masm) { 1424 #ifdef ASSERT 1425 Label L_good_sp; 1426 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1427 __ jcc(Assembler::equal, L_good_sp); 1428 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1429 __ bind(L_good_sp); 1430 #endif 1431 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1432 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1433 1434 if (CheckJNICalls) { 1435 // Check if this is a virtual thread continuation 1436 Label L_skip_vthread_code; 1437 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1438 __ 
jcc(Assembler::equal, L_skip_vthread_code); 1439 1440 // If the held monitor count is > 0 and this vthread is terminating then 1441 // it failed to release a JNI monitor. So we issue the same log message 1442 // that JavaThread::exit does. 1443 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1444 __ jcc(Assembler::equal, L_skip_vthread_code); 1445 1446 // rax may hold an exception oop, save it before the call 1447 __ push(rax); 1448 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1449 __ pop(rax); 1450 1451 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1452 // on termination. The held count is implicitly zeroed below when we restore from 1453 // the parent held count (which has to be zero). 1454 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1455 1456 __ bind(L_skip_vthread_code); 1457 } 1458 #ifdef ASSERT 1459 else { 1460 // Check if this is a virtual thread continuation 1461 Label L_skip_vthread_code; 1462 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1463 __ jcc(Assembler::equal, L_skip_vthread_code); 1464 1465 // See comment just above. If not checking JNI calls the JNI count is only 1466 // needed for assertion checking. 1467 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1468 1469 __ bind(L_skip_vthread_code); 1470 } 1471 #endif 1472 1473 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1474 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1475 1476 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1477 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1478 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1479 } 1480 1481 static void gen_continuation_enter(MacroAssembler* masm, 1482 const VMRegPair* regs, 1483 int& exception_offset, 1484 OopMapSet* oop_maps, 1485 int& frame_complete, 1486 int& stack_slots, 1487 int& interpreted_entry_offset, 1488 int& compiled_entry_offset) { 1489 1490 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1491 int pos_cont_obj = 0; 1492 int pos_is_cont = 1; 1493 int pos_is_virtual = 2; 1494 1495 // The platform-specific calling convention may present the arguments in various registers. 1496 // To simplify the rest of the code, we expect the arguments to reside at these known 1497 // registers, and we additionally check the placement here in case calling convention ever 1498 // changes. 
1499 Register reg_cont_obj = c_rarg1; 1500 Register reg_is_cont = c_rarg2; 1501 Register reg_is_virtual = c_rarg3; 1502 1503 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1504 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1505 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1506 1507 // Utility methods kill rax, make sure there are no collisions 1508 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1509 1510 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1511 relocInfo::static_call_type); 1512 1513 address start = __ pc(); 1514 1515 Label L_thaw, L_exit; 1516 1517 // i2i entry used at interp_only_mode only 1518 interpreted_entry_offset = __ pc() - start; 1519 { 1520 #ifdef ASSERT 1521 Label is_interp_only; 1522 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1523 __ jcc(Assembler::notEqual, is_interp_only); 1524 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1525 __ bind(is_interp_only); 1526 #endif 1527 1528 __ pop(rax); // return address 1529 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1530 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1531 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1532 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1533 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1534 __ push(rax); // return address 1535 __ push_cont_fastpath(); 1536 1537 __ enter(); 1538 1539 stack_slots = 2; // will be adjusted in setup 1540 OopMap* map = continuation_enter_setup(masm, stack_slots); 1541 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1542 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1543 1544 __ verify_oop(reg_cont_obj); 1545 1546 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1547 1548 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1549 __ testptr(reg_is_cont, reg_is_cont); 1550 __ jcc(Assembler::notZero, L_thaw); 1551 1552 // --- Resolve path 1553 1554 // Make sure the call is patchable 1555 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1556 // Emit stub for static call 1557 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1558 if (stub == nullptr) { 1559 fatal("CodeCache is full at gen_continuation_enter"); 1560 } 1561 __ call(resolve); 1562 oop_maps->add_gc_map(__ pc() - start, map); 1563 __ post_call_nop(); 1564 1565 __ jmp(L_exit); 1566 } 1567 1568 // compiled entry 1569 __ align(CodeEntryAlignment); 1570 compiled_entry_offset = __ pc() - start; 1571 __ enter(); 1572 1573 stack_slots = 2; // will be adjusted in setup 1574 OopMap* map = continuation_enter_setup(masm, stack_slots); 1575 1576 // Frame is now completed as far as size and linkage. 1577 frame_complete = __ pc() - start; 1578 1579 __ verify_oop(reg_cont_obj); 1580 1581 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1582 1583 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1584 __ testptr(reg_is_cont, reg_is_cont); 1585 __ jccb(Assembler::notZero, L_thaw); 1586 1587 // --- call Continuation.enter(Continuation c, boolean isContinue) 1588 1589 // Make sure the call is patchable 1590 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1591 1592 // Emit stub for static call 1593 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1594 if (stub == nullptr) { 1595 fatal("CodeCache is full at gen_continuation_enter"); 1596 } 1597 1598 // The call needs to be resolved. There's a special case for this in 1599 // SharedRuntime::find_callee_info_helper() which calls 1600 // LinkResolver::resolve_continuation_enter() which resolves the call to 1601 // Continuation.enter(Continuation c, boolean isContinue). 1602 __ call(resolve); 1603 1604 oop_maps->add_gc_map(__ pc() - start, map); 1605 __ post_call_nop(); 1606 1607 __ jmpb(L_exit); 1608 1609 // --- Thawing path 1610 1611 __ bind(L_thaw); 1612 1613 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1614 1615 ContinuationEntry::_return_pc_offset = __ pc() - start; 1616 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1617 __ post_call_nop(); 1618 1619 // --- Normal exit (resolve/thawing) 1620 1621 __ bind(L_exit); 1622 1623 continuation_enter_cleanup(masm); 1624 __ pop(rbp); 1625 __ ret(0); 1626 1627 // --- Exception handling path 1628 1629 exception_offset = __ pc() - start; 1630 1631 continuation_enter_cleanup(masm); 1632 __ pop(rbp); 1633 1634 __ movptr(c_rarg0, r15_thread); 1635 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1636 1637 // rax still holds the original exception oop, save it before the call 1638 __ push(rax); 1639 1640 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1641 __ movptr(rbx, rax); 1642 1643 // Continue at exception handler: 1644 // rax: exception oop 1645 // rbx: exception handler 1646 // rdx: exception pc 1647 __ pop(rax); 1648 __ verify_oop(rax); 1649 __ pop(rdx); 1650 __ jmp(rbx); 1651 } 1652 1653 static void gen_continuation_yield(MacroAssembler* masm, 1654 const VMRegPair* regs, 1655 OopMapSet* oop_maps, 1656 int& frame_complete, 1657 int& stack_slots, 1658 int& compiled_entry_offset) { 1659 enum layout { 1660 rbp_off, 1661 rbpH_off, 1662 return_off, 1663 return_off2, 1664 framesize // inclusive of return address 1665 }; 1666 stack_slots = framesize / VMRegImpl::slots_per_word; 1667 assert(stack_slots == 2, "recheck layout"); 1668 1669 address start = __ pc(); 1670 compiled_entry_offset = __ pc() - start; 1671 __ enter(); 1672 address the_pc = __ pc(); 1673 1674 frame_complete = the_pc - start; 1675 1676 // This nop must be exactly at the PC we push into the frame info. 1677 // We use this nop for fast CodeBlob lookup, associate the OopMap 1678 // with it right away. 
1679 __ post_call_nop(); 1680 OopMap* map = new OopMap(framesize, 1); 1681 oop_maps->add_gc_map(frame_complete, map); 1682 1683 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1684 __ movptr(c_rarg0, r15_thread); 1685 __ movptr(c_rarg1, rsp); 1686 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1687 __ reset_last_Java_frame(true); 1688 1689 Label L_pinned; 1690 1691 __ testptr(rax, rax); 1692 __ jcc(Assembler::notZero, L_pinned); 1693 1694 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1695 continuation_enter_cleanup(masm); 1696 __ pop(rbp); 1697 __ ret(0); 1698 1699 __ bind(L_pinned); 1700 1701 // Pinned, return to caller 1702 1703 // handle pending exception thrown by freeze 1704 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1705 Label ok; 1706 __ jcc(Assembler::equal, ok); 1707 __ leave(); 1708 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1709 __ bind(ok); 1710 1711 __ leave(); 1712 __ ret(0); 1713 } 1714 1715 static void gen_special_dispatch(MacroAssembler* masm, 1716 const methodHandle& method, 1717 const BasicType* sig_bt, 1718 const VMRegPair* regs) { 1719 verify_oop_args(masm, method, sig_bt, regs); 1720 vmIntrinsics::ID iid = method->intrinsic_id(); 1721 1722 // Now write the args into the outgoing interpreter space 1723 bool has_receiver = false; 1724 Register receiver_reg = noreg; 1725 int member_arg_pos = -1; 1726 Register member_reg = noreg; 1727 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1728 if (ref_kind != 0) { 1729 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1730 member_reg = rbx; // known to be free at this point 1731 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1732 } else if (iid == vmIntrinsics::_invokeBasic) { 1733 has_receiver = true; 1734 } else if (iid == vmIntrinsics::_linkToNative) { 1735 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1736 member_reg = rbx; // known to be free at this point 1737 } else { 1738 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1739 } 1740 1741 if (member_reg != noreg) { 1742 // Load the member_arg into register, if necessary. 1743 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1744 VMReg r = regs[member_arg_pos].first(); 1745 if (r->is_stack()) { 1746 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1747 } else { 1748 // no data motion is needed 1749 member_reg = r->as_Register(); 1750 } 1751 } 1752 1753 if (has_receiver) { 1754 // Make sure the receiver is loaded into a register. 1755 assert(method->size_of_parameters() > 0, "oob"); 1756 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1757 VMReg r = regs[0].first(); 1758 assert(r->is_valid(), "bad receiver arg"); 1759 if (r->is_stack()) { 1760 // Porting note: This assumes that compiled calling conventions always 1761 // pass the receiver oop in a register. If this is not true on some 1762 // platform, pick a temp and load the receiver from stack. 
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}

// ---------------------------------------------------------------------------
// Generate a native wrapper for a given method. The method takes arguments
// in the Java compiled code convention, marshals them to the native
// convention (handlizes oops, etc), transitions to native, makes the call,
// returns to Java state (possibly blocking), unhandlizes any result and
// returns.
//
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
//
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  if (method->is_continuation_native_intrinsic()) {
    int exception_offset = -1;
    OopMapSet* oop_maps = new OopMapSet();
    int frame_complete = -1;
    int stack_slots = -1;
    int interpreted_entry_offset = -1;
    int vep_offset = -1;
    if (method->is_continuation_enter_intrinsic()) {
      gen_continuation_enter(masm,
                             in_regs,
                             exception_offset,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             interpreted_entry_offset,
                             vep_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      gen_continuation_yield(masm,
                             in_regs,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             vep_offset);
    } else {
      guarantee(false, "Unknown Continuation native intrinsic");
    }

#ifdef ASSERT
    if (method->is_continuation_enter_intrinsic()) {
      assert(interpreted_entry_offset != -1, "Must be set");
      assert(exception_offset != -1, "Must be set");
    } else {
      assert(interpreted_entry_offset == -1, "Must be unset");
      assert(exception_offset == -1, "Must be unset");
    }
    assert(frame_complete != -1, "Must be set");
    assert(stack_slots != -1, "Must be set");
    assert(vep_offset != -1, "Must be set");
#endif

    __ flush();
    nmethod* nm = nmethod::new_native_nmethod(method,
                                              compile_id,
                                              masm->code(),
                                              vep_offset,
                                              frame_complete,
                                              stack_slots,
                                              in_ByteSize(-1),
                                              in_ByteSize(-1),
                                              oop_maps,
                                              exception_offset);
    if (nm == nullptr) return nm;
    if (method->is_continuation_enter_intrinsic()) {
      ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      _cont_doYield_stub = nm;
    }
    return nm;
  }

  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the JNI function will expect them. To figure out where they go
  // we convert the Java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).

  const int total_in_args = method->size_of_parameters();
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  BasicType* in_elem_bt = nullptr;

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT;
  }

  for (int i = 0; i < total_in_args ; i++ ) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper. We need to handlize all oops in
  // incoming registers.

  // Calculate the total number of stack slots we will need.
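  // Rough worked example of the accounting below (illustrative only, assuming a
  // non-static, non-synchronized native whose C arguments all fit in registers, so
  // out_preserve_stack_slots() == 0 and out_arg_slots == 0):
  //   0 (abi/outgoing) + 12 (oop handle area) + 0 (klass) + 0 (lock) + 6 (moves + ret/rbp)
  //   = 18 slots, which the align_up() below rounds to 20 slots (80 bytes) with the
  //   usual 16-byte stack alignment.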
1919 1920 // First count the abi requirement plus all of the outgoing args 1921 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1922 1923 // Now the space for the inbound oop handle area 1924 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1925 1926 int oop_handle_offset = stack_slots; 1927 stack_slots += total_save_slots; 1928 1929 // Now any space we need for handlizing a klass if static method 1930 1931 int klass_slot_offset = 0; 1932 int klass_offset = -1; 1933 int lock_slot_offset = 0; 1934 bool is_static = false; 1935 1936 if (method->is_static()) { 1937 klass_slot_offset = stack_slots; 1938 stack_slots += VMRegImpl::slots_per_word; 1939 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1940 is_static = true; 1941 } 1942 1943 // Plus a lock if needed 1944 1945 if (method->is_synchronized()) { 1946 lock_slot_offset = stack_slots; 1947 stack_slots += VMRegImpl::slots_per_word; 1948 } 1949 1950 // Now a place (+2) to save return values or temp during shuffling 1951 // + 4 for return address (which we own) and saved rbp 1952 stack_slots += 6; 1953 1954 // Ok The space we have allocated will look like: 1955 // 1956 // 1957 // FP-> | | 1958 // |---------------------| 1959 // | 2 slots for moves | 1960 // |---------------------| 1961 // | lock box (if sync) | 1962 // |---------------------| <- lock_slot_offset 1963 // | klass (if static) | 1964 // |---------------------| <- klass_slot_offset 1965 // | oopHandle area | 1966 // |---------------------| <- oop_handle_offset (6 java arg registers) 1967 // | outbound memory | 1968 // | based arguments | 1969 // | | 1970 // |---------------------| 1971 // | | 1972 // SP-> | out_preserved_slots | 1973 // 1974 // 1975 1976 1977 // Now compute actual number of stack words we need rounding to make 1978 // stack properly aligned. 1979 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1980 1981 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1982 1983 // First thing make an ic check to see if we should even be here 1984 1985 // We are free to use all registers as temps without saving them and 1986 // restoring them except rbp. rbp is the only callee save register 1987 // as far as the interpreter and the compiler(s) are concerned. 1988 1989 const Register receiver = j_rarg0; 1990 1991 Label exception_pending; 1992 1993 assert_different_registers(receiver, rscratch1, rscratch2); 1994 __ verify_oop(receiver); 1995 __ ic_check(8 /* end_alignment */); 1996 1997 int vep_offset = ((intptr_t)__ pc()) - start; 1998 1999 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2000 Label L_skip_barrier; 2001 Register klass = r10; 2002 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2003 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2004 2005 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2006 2007 __ bind(L_skip_barrier); 2008 } 2009 2010 #ifdef COMPILER1 2011 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
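  // (Sketch of the intent only: if the identity hash has already been installed in the
  // object's mark word, the helper below can return it directly and skip the native
  // transition entirely; otherwise it falls through to the full wrapper generated
  // below. See inline_check_hashcode_from_object_header() for the exact conditions.)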
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // The native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */

  // We use r14 as the oop handle for the receiver/klass.
  // It is callee-saved, so it survives the call to native.

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that for any VM call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal to (linux) or denser than (win64) the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more argument (and two more for static methods) than Java.
  // Therefore, if we move the args from java -> c backwards, we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do.
  // All inbound args are referenced based on rbp and all outbound args via rsp.

#ifdef ASSERT
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }
#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
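  // Illustrative: Java arg i of the signature becomes C arg i + 1 (i + 2 for a static
  // method) because of the leading JNIEnv* (and class mirror). Walking i from the last
  // argument down to 0 therefore only writes into registers/slots whose previous
  // contents have either already been moved or were never a Java argument, so no
  // temporary is needed and no move cycle can form.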
  GrowableArray<int> arg_order(2 * total_in_args);

  VMRegPair tmp_vmreg;
  tmp_vmreg.set2(rbx->as_VMReg());

  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  int temploc = -1;
  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14. Used both by locking code and
  // the normal JNI call code.
  // Point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out.
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    // load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror; it's known to be non-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out.

  intptr_t the_pc = (intptr_t) __ pc();
  oop_maps->add_gc_map(the_pc - start, map);

  __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);

  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers at this point (what if we save/restore them and there are no
  // oop map entries for them?).
2191 2192 if (DTraceMethodProbes) { 2193 // protect the args we've loaded 2194 save_args(masm, total_c_args, c_arg, out_regs); 2195 __ mov_metadata(c_rarg1, method()); 2196 __ call_VM_leaf( 2197 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2198 r15_thread, c_rarg1); 2199 restore_args(masm, total_c_args, c_arg, out_regs); 2200 } 2201 2202 // RedefineClasses() tracing support for obsolete method entry 2203 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2204 // protect the args we've loaded 2205 save_args(masm, total_c_args, c_arg, out_regs); 2206 __ mov_metadata(c_rarg1, method()); 2207 __ call_VM_leaf( 2208 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2209 r15_thread, c_rarg1); 2210 restore_args(masm, total_c_args, c_arg, out_regs); 2211 } 2212 2213 // Lock a synchronized method 2214 2215 // Register definitions used by locking and unlocking 2216 2217 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2218 const Register obj_reg = rbx; // Will contain the oop 2219 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2220 const Register old_hdr = r13; // value of old header at unlock time 2221 2222 Label slow_path_lock; 2223 Label lock_done; 2224 2225 if (method->is_synchronized()) { 2226 Label count_mon; 2227 2228 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2229 2230 // Get the handle (the 2nd argument) 2231 __ mov(oop_handle_reg, c_rarg1); 2232 2233 // Get address of the box 2234 2235 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2236 2237 // Load the oop from the handle 2238 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2239 2240 if (LockingMode == LM_MONITOR) { 2241 __ jmp(slow_path_lock); 2242 } else if (LockingMode == LM_LEGACY) { 2243 // Load immediate 1 into swap_reg %rax 2244 __ movl(swap_reg, 1); 2245 2246 // Load (object->mark() | 1) into swap_reg %rax 2247 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2248 2249 // Save (object->mark() | 1) into BasicLock's displaced header 2250 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2251 2252 // src -> dest iff dest == rax else rax <- dest 2253 __ lock(); 2254 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2255 __ jcc(Assembler::equal, count_mon); 2256 2257 // Hmm should this move to the slow path code area??? 2258 2259 // Test if the oopMark is an obvious stack pointer, i.e., 2260 // 1) (mark & 3) == 0, and 2261 // 2) rsp <= mark < mark + os::pagesize() 2262 // These 3 tests can be done by evaluating the following 2263 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2264 // assuming both stack pointer and pagesize have their 2265 // least significant 2 bits clear. 
2266 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2267 2268 __ subptr(swap_reg, rsp); 2269 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2270 2271 // Save the test result, for recursive case, the result is zero 2272 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2273 __ jcc(Assembler::notEqual, slow_path_lock); 2274 } else { 2275 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2276 __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2277 } 2278 __ bind(count_mon); 2279 __ inc_held_monitor_count(); 2280 2281 // Slow path will re-enter here 2282 __ bind(lock_done); 2283 } 2284 2285 // Finally just about ready to make the JNI call 2286 2287 // get JNIEnv* which is first argument to native 2288 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2289 2290 // Now set thread in native 2291 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2292 2293 __ call(RuntimeAddress(native_func)); 2294 2295 // Verify or restore cpu control state after JNI call 2296 __ restore_cpu_control_state_after_jni(rscratch1); 2297 2298 // Unpack native results. 2299 switch (ret_type) { 2300 case T_BOOLEAN: __ c2bool(rax); break; 2301 case T_CHAR : __ movzwl(rax, rax); break; 2302 case T_BYTE : __ sign_extend_byte (rax); break; 2303 case T_SHORT : __ sign_extend_short(rax); break; 2304 case T_INT : /* nothing to do */ break; 2305 case T_DOUBLE : 2306 case T_FLOAT : 2307 // Result is in xmm0 we'll save as needed 2308 break; 2309 case T_ARRAY: // Really a handle 2310 case T_OBJECT: // Really a handle 2311 break; // can't de-handlize until after safepoint check 2312 case T_VOID: break; 2313 case T_LONG: break; 2314 default : ShouldNotReachHere(); 2315 } 2316 2317 Label after_transition; 2318 2319 // Switch thread to "native transition" state before reading the synchronization state. 2320 // This additional state is necessary because reading and testing the synchronization 2321 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2322 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2323 // VM thread changes sync state to synchronizing and suspends threads for GC. 2324 // Thread A is resumed to finish this native method, but doesn't block here since it 2325 // didn't see any synchronization is progress, and escapes. 2326 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2327 2328 // Force this write out before the read below 2329 if (!UseSystemMemoryBarrier) { 2330 __ membar(Assembler::Membar_mask_bits( 2331 Assembler::LoadLoad | Assembler::LoadStore | 2332 Assembler::StoreLoad | Assembler::StoreStore)); 2333 } 2334 2335 // check for safepoint operation in progress and/or pending suspend requests 2336 { 2337 Label Continue; 2338 Label slow_path; 2339 2340 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2341 2342 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2343 __ jcc(Assembler::equal, Continue); 2344 __ bind(slow_path); 2345 2346 // Don't use call_VM as it will see a possible pending exception and forward it 2347 // and never return here preventing us from clearing _last_native_pc down below. 2348 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2349 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2350 // by hand. 
2351 // 2352 __ vzeroupper(); 2353 save_native_result(masm, ret_type, stack_slots); 2354 __ mov(c_rarg0, r15_thread); 2355 __ mov(r12, rsp); // remember sp 2356 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2357 __ andptr(rsp, -16); // align stack as required by ABI 2358 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2359 __ mov(rsp, r12); // restore sp 2360 __ reinit_heapbase(); 2361 // Restore any method result value 2362 restore_native_result(masm, ret_type, stack_slots); 2363 __ bind(Continue); 2364 } 2365 2366 // change thread state 2367 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2368 __ bind(after_transition); 2369 2370 Label reguard; 2371 Label reguard_done; 2372 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2373 __ jcc(Assembler::equal, reguard); 2374 __ bind(reguard_done); 2375 2376 // native result if any is live 2377 2378 // Unlock 2379 Label slow_path_unlock; 2380 Label unlock_done; 2381 if (method->is_synchronized()) { 2382 2383 Label fast_done; 2384 2385 // Get locked oop from the handle we passed to jni 2386 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2387 2388 if (LockingMode == LM_LEGACY) { 2389 Label not_recur; 2390 // Simple recursive lock? 2391 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2392 __ jcc(Assembler::notEqual, not_recur); 2393 __ dec_held_monitor_count(); 2394 __ jmpb(fast_done); 2395 __ bind(not_recur); 2396 } 2397 2398 // Must save rax if it is live now because cmpxchg must use it 2399 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2400 save_native_result(masm, ret_type, stack_slots); 2401 } 2402 2403 if (LockingMode == LM_MONITOR) { 2404 __ jmp(slow_path_unlock); 2405 } else if (LockingMode == LM_LEGACY) { 2406 // get address of the stack lock 2407 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2408 // get old displaced header 2409 __ movptr(old_hdr, Address(rax, 0)); 2410 2411 // Atomic swap old header if oop still contains the stack lock 2412 __ lock(); 2413 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2414 __ jcc(Assembler::notEqual, slow_path_unlock); 2415 __ dec_held_monitor_count(); 2416 } else { 2417 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2418 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2419 __ dec_held_monitor_count(); 2420 } 2421 2422 // slow path re-enters here 2423 __ bind(unlock_done); 2424 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2425 restore_native_result(masm, ret_type, stack_slots); 2426 } 2427 2428 __ bind(fast_done); 2429 } 2430 if (DTraceMethodProbes) { 2431 save_native_result(masm, ret_type, stack_slots); 2432 __ mov_metadata(c_rarg1, method()); 2433 __ call_VM_leaf( 2434 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2435 r15_thread, c_rarg1); 2436 restore_native_result(masm, ret_type, stack_slots); 2437 } 2438 2439 __ reset_last_Java_frame(false); 2440 2441 // Unbox oop result, e.g. JNIHandles::resolve value. 
2442 if (is_reference_type(ret_type)) { 2443 __ resolve_jobject(rax /* value */, 2444 r15_thread /* thread */, 2445 rcx /* tmp */); 2446 } 2447 2448 if (CheckJNICalls) { 2449 // clear_pending_jni_exception_check 2450 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2451 } 2452 2453 // reset handle block 2454 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2455 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2456 2457 // pop our frame 2458 2459 __ leave(); 2460 2461 // Any exception pending? 2462 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2463 __ jcc(Assembler::notEqual, exception_pending); 2464 2465 // Return 2466 2467 __ ret(0); 2468 2469 // Unexpected paths are out of line and go here 2470 2471 // forward the exception 2472 __ bind(exception_pending); 2473 2474 // and forward the exception 2475 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2476 2477 // Slow path locking & unlocking 2478 if (method->is_synchronized()) { 2479 2480 // BEGIN Slow path lock 2481 __ bind(slow_path_lock); 2482 2483 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2484 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2485 2486 // protect the args we've loaded 2487 save_args(masm, total_c_args, c_arg, out_regs); 2488 2489 __ mov(c_rarg0, obj_reg); 2490 __ mov(c_rarg1, lock_reg); 2491 __ mov(c_rarg2, r15_thread); 2492 2493 // Not a leaf but we have last_Java_frame setup as we want 2494 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2495 restore_args(masm, total_c_args, c_arg, out_regs); 2496 2497 #ifdef ASSERT 2498 { Label L; 2499 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2500 __ jcc(Assembler::equal, L); 2501 __ stop("no pending exception allowed on exit from monitorenter"); 2502 __ bind(L); 2503 } 2504 #endif 2505 __ jmp(lock_done); 2506 2507 // END Slow path lock 2508 2509 // BEGIN Slow path unlock 2510 __ bind(slow_path_unlock); 2511 2512 // If we haven't already saved the native result we must save it now as xmm registers 2513 // are still exposed. 
2514 __ vzeroupper(); 2515 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2516 save_native_result(masm, ret_type, stack_slots); 2517 } 2518 2519 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2520 2521 __ mov(c_rarg0, obj_reg); 2522 __ mov(c_rarg2, r15_thread); 2523 __ mov(r12, rsp); // remember sp 2524 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2525 __ andptr(rsp, -16); // align stack as required by ABI 2526 2527 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2528 // NOTE that obj_reg == rbx currently 2529 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2530 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2531 2532 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2533 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2534 __ mov(rsp, r12); // restore sp 2535 __ reinit_heapbase(); 2536 #ifdef ASSERT 2537 { 2538 Label L; 2539 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2540 __ jcc(Assembler::equal, L); 2541 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2542 __ bind(L); 2543 } 2544 #endif /* ASSERT */ 2545 2546 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2547 2548 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2549 restore_native_result(masm, ret_type, stack_slots); 2550 } 2551 __ jmp(unlock_done); 2552 2553 // END Slow path unlock 2554 2555 } // synchronized 2556 2557 // SLOW PATH Reguard the stack if needed 2558 2559 __ bind(reguard); 2560 __ vzeroupper(); 2561 save_native_result(masm, ret_type, stack_slots); 2562 __ mov(r12, rsp); // remember sp 2563 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2564 __ andptr(rsp, -16); // align stack as required by ABI 2565 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2566 __ mov(rsp, r12); // restore sp 2567 __ reinit_heapbase(); 2568 restore_native_result(masm, ret_type, stack_slots); 2569 // and continue 2570 __ jmp(reguard_done); 2571 2572 2573 2574 __ flush(); 2575 2576 nmethod *nm = nmethod::new_native_nmethod(method, 2577 compile_id, 2578 masm->code(), 2579 vep_offset, 2580 frame_complete, 2581 stack_slots / VMRegImpl::slots_per_word, 2582 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2583 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2584 oop_maps); 2585 2586 return nm; 2587 } 2588 2589 // this function returns the adjust size (in number of words) to a c2i adapter 2590 // activation for use during deoptimization 2591 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2592 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2593 } 2594 2595 2596 uint SharedRuntime::out_preserve_stack_slots() { 2597 return 0; 2598 } 2599 2600 2601 // Number of stack slots between incoming argument block and the start of 2602 // a new frame. The PROLOG must add this many slots to the stack. The 2603 // EPILOG must remove this many slots. amd64 needs two slots for 2604 // return address. 
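// Worked out for this port: with VerifyStackAtCalls off this is 4 slots (16 bytes,
// presumably the return address plus the saved rbp at 2 slots each); enabling
// VerifyStackAtCalls reserves 2 extra slots for the call-site stack check.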
2605 uint SharedRuntime::in_preserve_stack_slots() { 2606 return 4 + 2 * VerifyStackAtCalls; 2607 } 2608 2609 //------------------------------generate_deopt_blob---------------------------- 2610 void SharedRuntime::generate_deopt_blob() { 2611 // Allocate space for the code 2612 ResourceMark rm; 2613 // Setup code generation tools 2614 int pad = 0; 2615 if (UseAVX > 2) { 2616 pad += 1024; 2617 } 2618 if (UseAPX) { 2619 pad += 1024; 2620 } 2621 #if INCLUDE_JVMCI 2622 if (EnableJVMCI) { 2623 pad += 512; // Increase the buffer size when compiling for JVMCI 2624 } 2625 #endif 2626 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id); 2627 CodeBuffer buffer(name, 2560+pad, 1024); 2628 MacroAssembler* masm = new MacroAssembler(&buffer); 2629 int frame_size_in_words; 2630 OopMap* map = nullptr; 2631 OopMapSet *oop_maps = new OopMapSet(); 2632 2633 // ------------- 2634 // This code enters when returning to a de-optimized nmethod. A return 2635 // address has been pushed on the stack, and return values are in 2636 // registers. 2637 // If we are doing a normal deopt then we were called from the patched 2638 // nmethod from the point we returned to the nmethod. So the return 2639 // address on the stack is wrong by NativeCall::instruction_size 2640 // We will adjust the value so it looks like we have the original return 2641 // address on the stack (like when we eagerly deoptimized). 2642 // In the case of an exception pending when deoptimizing, we enter 2643 // with a return address on the stack that points after the call we patched 2644 // into the exception handler. We have the following register state from, 2645 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2646 // rax: exception oop 2647 // rbx: exception handler 2648 // rdx: throwing pc 2649 // So in this case we simply jam rdx into the useless return address and 2650 // the stack looks just like we want. 2651 // 2652 // At this point we need to de-opt. We save the argument return 2653 // registers. We call the first C routine, fetch_unroll_info(). This 2654 // routine captures the return values and returns a structure which 2655 // describes the current frame size and the sizes of all replacement frames. 2656 // The current frame is compiled code and may contain many inlined 2657 // functions, each with their own JVM state. We pop the current frame, then 2658 // push all the new frames. Then we call the C routine unpack_frames() to 2659 // populate these frames. Finally unpack_frames() returns us the new target 2660 // address. Notice that callee-save registers are BLOWN here; they have 2661 // already been captured in the vframeArray at the time the return PC was 2662 // patched. 2663 address start = __ pc(); 2664 Label cont; 2665 2666 // Prolog for non exception case! 2667 2668 // Save everything in sight. 2669 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2670 2671 // Normal deoptimization. Save exec mode for unpack_frames. 
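  // (r14 carries the Deoptimization exec mode for the rest of this blob: Unpack_deopt
  // here, Unpack_reexecute and Unpack_exception at the other entry points below; it is
  // later passed as the exec_mode argument to fetch_unroll_info() and unpack_frames().)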
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan.
  // Exception oop and throwing PC are passed in JavaThread.
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception.
Save exec mode for unpack_frames. 2758 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2759 2760 // load throwing pc from JavaThread and patch it as the return address 2761 // of the current frame. Then clear the field in JavaThread 2762 2763 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2764 __ movptr(Address(rbp, wordSize), rdx); 2765 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2766 2767 #ifdef ASSERT 2768 // verify that there is really an exception oop in JavaThread 2769 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2770 __ verify_oop(rax); 2771 2772 // verify that there is no pending exception 2773 Label no_pending_exception; 2774 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2775 __ testptr(rax, rax); 2776 __ jcc(Assembler::zero, no_pending_exception); 2777 __ stop("must not have pending exception here"); 2778 __ bind(no_pending_exception); 2779 #endif 2780 2781 __ bind(cont); 2782 2783 // Call C code. Need thread and this frame, but NOT official VM entry 2784 // crud. We cannot block on this call, no GC can happen. 2785 // 2786 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2787 2788 // fetch_unroll_info needs to call last_java_frame(). 2789 2790 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2791 #ifdef ASSERT 2792 { Label L; 2793 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2794 __ jcc(Assembler::equal, L); 2795 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2796 __ bind(L); 2797 } 2798 #endif // ASSERT 2799 __ mov(c_rarg0, r15_thread); 2800 __ movl(c_rarg1, r14); // exec_mode 2801 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2802 2803 // Need to have an oopmap that tells fetch_unroll_info where to 2804 // find any register it might need. 2805 oop_maps->add_gc_map(__ pc() - start, map); 2806 2807 __ reset_last_Java_frame(false); 2808 2809 #if INCLUDE_JVMCI 2810 if (EnableJVMCI) { 2811 __ bind(after_fetch_unroll_info_call); 2812 } 2813 #endif 2814 2815 // Load UnrollBlock* into rdi 2816 __ mov(rdi, rax); 2817 2818 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2819 Label noException; 2820 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 2821 __ jcc(Assembler::notEqual, noException); 2822 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2823 // QQQ this is useless it was null above 2824 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2825 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2826 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2827 2828 __ verify_oop(rax); 2829 2830 // Overwrite the result registers with the exception results. 2831 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2832 // I think this is useless 2833 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2834 2835 __ bind(noException); 2836 2837 // Only register save data is on the stack. 2838 // Now restore the result registers. Everything else is either dead 2839 // or captured in the vframeArray. 2840 RegisterSaver::restore_result_registers(masm); 2841 2842 // All of the register save area has been popped of the stack. Only the 2843 // return address remains. 2844 2845 // Pop all the frames we must move/replace. 
2846 // 2847 // Frame picture (youngest to oldest) 2848 // 1: self-frame (no frame link) 2849 // 2: deopting frame (no frame link) 2850 // 3: caller of deopting frame (could be compiled/interpreted). 2851 // 2852 // Note: by leaving the return address of self-frame on the stack 2853 // and using the size of frame 2 to adjust the stack 2854 // when we are done the return to frame 3 will still be on the stack. 2855 2856 // Pop deoptimized frame 2857 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2858 __ addptr(rsp, rcx); 2859 2860 // rsp should be pointing at the return address to the caller (3) 2861 2862 // Pick up the initial fp we should save 2863 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2864 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2865 2866 #ifdef ASSERT 2867 // Compilers generate code that bang the stack by as much as the 2868 // interpreter would need. So this stack banging should never 2869 // trigger a fault. Verify that it does not on non product builds. 2870 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2871 __ bang_stack_size(rbx, rcx); 2872 #endif 2873 2874 // Load address of array of frame pcs into rcx 2875 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2876 2877 // Trash the old pc 2878 __ addptr(rsp, wordSize); 2879 2880 // Load address of array of frame sizes into rsi 2881 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2882 2883 // Load counter into rdx 2884 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2885 2886 // Now adjust the caller's stack to make up for the extra locals 2887 // but record the original sp so that we can save it in the skeletal interpreter 2888 // frame and the stack walking of interpreter_sender will get the unextended sp 2889 // value and not the "real" sp value. 2890 2891 const Register sender_sp = r8; 2892 2893 __ mov(sender_sp, rsp); 2894 __ movl(rbx, Address(rdi, 2895 Deoptimization::UnrollBlock:: 2896 caller_adjustment_offset())); 2897 __ subptr(rsp, rbx); 2898 2899 // Push interpreter frames in a loop 2900 Label loop; 2901 __ bind(loop); 2902 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2903 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2904 __ pushptr(Address(rcx, 0)); // Save return address 2905 __ enter(); // Save old & set new ebp 2906 __ subptr(rsp, rbx); // Prolog 2907 // This value is corrected by layout_activation_impl 2908 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2909 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2910 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2911 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2912 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2913 __ decrementl(rdx); // Decrement counter 2914 __ jcc(Assembler::notZero, loop); 2915 __ pushptr(Address(rcx, 0)); // Save final return address 2916 2917 // Re-push self-frame 2918 __ enter(); // Save old & set new ebp 2919 2920 // Allocate a full sized register save area. 2921 // Return address and rbp are in place, so we allocate two less words. 
2922 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2923 2924 // Restore frame locals after moving the frame 2925 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2926 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2927 2928 // Call C code. Need thread but NOT official VM entry 2929 // crud. We cannot block on this call, no GC can happen. Call should 2930 // restore return values to their stack-slots with the new SP. 2931 // 2932 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2933 2934 // Use rbp because the frames look interpreted now 2935 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2936 // Don't need the precise return PC here, just precise enough to point into this code blob. 2937 address the_pc = __ pc(); 2938 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2939 2940 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2941 __ mov(c_rarg0, r15_thread); 2942 __ movl(c_rarg1, r14); // second arg: exec_mode 2943 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2944 // Revert SP alignment after call since we're going to do some SP relative addressing below 2945 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2946 2947 // Set an oopmap for the call site 2948 // Use the same PC we used for the last java frame 2949 oop_maps->add_gc_map(the_pc - start, 2950 new OopMap( frame_size_in_words, 0 )); 2951 2952 // Clear fp AND pc 2953 __ reset_last_Java_frame(true); 2954 2955 // Collect return values 2956 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2957 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2958 // I think this is useless (throwing pc?) 2959 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2960 2961 // Pop self-frame. 2962 __ leave(); // Epilog 2963 2964 // Jump to interpreter 2965 __ ret(0); 2966 2967 // Make sure all code is generated 2968 masm->flush(); 2969 2970 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2971 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2972 #if INCLUDE_JVMCI 2973 if (EnableJVMCI) { 2974 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2975 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2976 } 2977 #endif 2978 } 2979 2980 //------------------------------generate_handler_blob------ 2981 // 2982 // Generate a special Compile2Runtime blob that saves all registers, 2983 // and setup oopmap. 2984 // 2985 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) { 2986 assert(StubRoutines::forward_exception_entry() != nullptr, 2987 "must be generated before"); 2988 assert(is_polling_page_id(id), "expected a polling page stub id"); 2989 2990 ResourceMark rm; 2991 OopMapSet *oop_maps = new OopMapSet(); 2992 OopMap* map; 2993 2994 // Allocate space for the code. Setup code generation tools. 
2995 const char* name = SharedRuntime::stub_name(id); 2996 CodeBuffer buffer(name, 2348, 1024); 2997 MacroAssembler* masm = new MacroAssembler(&buffer); 2998 2999 address start = __ pc(); 3000 address call_pc = nullptr; 3001 int frame_size_in_words; 3002 bool cause_return = (id == SharedStubId::polling_page_return_handler_id); 3003 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id); 3004 3005 // Make room for return address (or push it again) 3006 if (!cause_return) { 3007 __ push(rbx); 3008 } 3009 3010 // Save registers, fpu state, and flags 3011 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3012 3013 // The following is basically a call_VM. However, we need the precise 3014 // address of the call in order to generate an oopmap. Hence, we do all the 3015 // work ourselves. 3016 3017 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 3018 3019 // The return address must always be correct so that frame constructor never 3020 // sees an invalid pc. 3021 3022 if (!cause_return) { 3023 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3024 // Additionally, rbx is a callee saved register and we can look at it later to determine 3025 // if someone changed the return address for us! 3026 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3027 __ movptr(Address(rbp, wordSize), rbx); 3028 } 3029 3030 // Do the call 3031 __ mov(c_rarg0, r15_thread); 3032 __ call(RuntimeAddress(call_ptr)); 3033 3034 // Set an oopmap for the call site. This oopmap will map all 3035 // oop-registers and debug-info registers as callee-saved. This 3036 // will allow deoptimization at this safepoint to find all possible 3037 // debug-info recordings, as well as let GC find all oops. 3038 3039 oop_maps->add_gc_map( __ pc() - start, map); 3040 3041 Label noException; 3042 3043 __ reset_last_Java_frame(false); 3044 3045 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3046 __ jcc(Assembler::equal, noException); 3047 3048 // Exception pending 3049 3050 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3051 3052 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3053 3054 // No exception case 3055 __ bind(noException); 3056 3057 Label no_adjust; 3058 #ifdef ASSERT 3059 Label bail; 3060 #endif 3061 if (!cause_return) { 3062 Label no_prefix, not_special; 3063 3064 // If our stashed return pc was modified by the runtime we avoid touching it 3065 __ cmpptr(rbx, Address(rbp, wordSize)); 3066 __ jccb(Assembler::notEqual, no_adjust); 3067 3068 // Skip over the poll instruction. 
3069 // See NativeInstruction::is_safepoint_poll()
3070 // Possible encodings:
3071 // 85 00 test %eax,(%rax)
3072 // 85 01 test %eax,(%rcx)
3073 // 85 02 test %eax,(%rdx)
3074 // 85 03 test %eax,(%rbx)
3075 // 85 06 test %eax,(%rsi)
3076 // 85 07 test %eax,(%rdi)
3077 //
3078 // 41 85 00 test %eax,(%r8)
3079 // 41 85 01 test %eax,(%r9)
3080 // 41 85 02 test %eax,(%r10)
3081 // 41 85 03 test %eax,(%r11)
3082 // 41 85 06 test %eax,(%r14)
3083 // 41 85 07 test %eax,(%r15)
3084 //
3085 // 85 04 24 test %eax,(%rsp)
3086 // 41 85 04 24 test %eax,(%r12)
3087 // 85 45 00 test %eax,0x0(%rbp)
3088 // 41 85 45 00 test %eax,0x0(%r13)
3089 
3090 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3091 __ jcc(Assembler::notEqual, no_prefix);
3092 __ addptr(rbx, 1);
3093 __ bind(no_prefix);
3094 #ifdef ASSERT
3095 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3096 #endif
3097 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3098 // r12/rsp 0x04
3099 // r13/rbp 0x05
3100 __ movzbq(rcx, Address(rbx, 1));
3101 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3102 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3103 __ cmpptr(rcx, 1);
3104 __ jcc(Assembler::above, not_special);
3105 __ addptr(rbx, 1);
3106 __ bind(not_special);
3107 #ifdef ASSERT
3108 // Verify the correct encoding of the poll we're about to skip.
3109 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3110 __ jcc(Assembler::notEqual, bail);
3111 // Mask out the modrm bits
3112 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3113 // rax encodes to 0, so if the bits are nonzero it's incorrect
3114 __ jcc(Assembler::notZero, bail);
3115 #endif
3116 // Adjust return pc forward to step over the safepoint poll instruction
3117 __ addptr(rbx, 2);
3118 __ movptr(Address(rbp, wordSize), rbx);
3119 }
3120 
3121 __ bind(no_adjust);
3122 // Normal exit, restore registers and exit.
3123 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3124 __ ret(0);
3125 
3126 #ifdef ASSERT
3127 __ bind(bail);
3128 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3129 #endif
3130 
3131 // Make sure all code is generated
3132 masm->flush();
3133 
3134 // Fill out other meta info
3135 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3136 }
3137 
3138 //
3139 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3140 //
3141 // Generate a stub that calls into the VM to find out the proper destination
3142 // of a Java call. All the argument registers are live at this point
3143 // but since this is generic code we don't know what they are and the caller
3144 // must do any GC of the args.
3145 //
3146 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3147 assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3148 assert(is_resolve_id(id), "expected a resolve stub id");
3149 
3150 // allocate space for the code
3151 ResourceMark rm;
3152 
3153 const char* name = SharedRuntime::stub_name(id);
3154 CodeBuffer buffer(name, 1552, 512);
3155 MacroAssembler* masm = new MacroAssembler(&buffer);
3156 
3157 int frame_size_in_words;
3158 
3159 OopMapSet *oop_maps = new OopMapSet();
3160 OopMap* map = nullptr;
3161 
3162 int start = __ offset();
3163 
3164 // No need to save vector registers since they are caller-saved anyway.
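// A sketch of the flow below: save_live_registers preserves the (possibly
// oop-carrying) argument registers; destination (e.g. one of the
// SharedRuntime::resolve_*_call_C entry points) returns the code entry point
// in rax and leaves the resolved Method* in the thread's vm_result_2 slot.
// Both values are written back into the saved-register area so that
// restore_live_registers reloads them into rax/rbx before the final
// "jmp rax".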
3165 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3166 
3167 int frame_complete = __ offset();
3168 
3169 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3170 
3171 __ mov(c_rarg0, r15_thread);
3172 
3173 __ call(RuntimeAddress(destination));
3174 
3175 
3176 // Set an oopmap for the call site.
3177 // We need this not only for callee-saved registers, but also for volatile
3178 // registers that the compiler might be keeping live across a safepoint.
3179 
3180 oop_maps->add_gc_map( __ offset() - start, map);
3181 
3182 // rax contains the address we are going to jump to, assuming no exception was installed
3183 
3184 // clear last_Java_sp
3185 __ reset_last_Java_frame(false);
3186 // check for pending exceptions
3187 Label pending;
3188 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3189 __ jcc(Assembler::notEqual, pending);
3190 
3191 // get the returned Method*
3192 __ get_vm_result_2(rbx, r15_thread);
3193 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3194 
3195 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3196 
3197 RegisterSaver::restore_live_registers(masm);
3198 
3199 // We are back to the original state on entry and ready to go.
3200 
3201 __ jmp(rax);
3202 
3203 // Pending exception after the safepoint
3204 
3205 __ bind(pending);
3206 
3207 RegisterSaver::restore_live_registers(masm);
3208 
3209 // exception pending => remove activation and forward to exception handler
3210 
3211 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3212 
3213 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3214 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3215 
3216 // -------------
3217 // make sure all code is generated
3218 masm->flush();
3219 
3220 // return the blob
3221 // frame size is in words, which is what new_runtime_stub expects
3222 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3223 }
3224 
3225 // Continuation point for throwing of implicit exceptions that are
3226 // not handled in the current activation. Fabricates an exception
3227 // oop and initiates normal exception dispatching in this
3228 // frame. Since we need to preserve callee-saved values (currently
3229 // only for C2, but done for C1 as well) we need a callee-saved oop
3230 // map and therefore have to make these stubs into RuntimeStubs
3231 // rather than BufferBlobs. If the compiler needs all registers to
3232 // be preserved between the fault point and the exception handler
3233 // then it must assume responsibility for that in
3234 // AbstractCompiler::continuation_for_implicit_null_exception or
3235 // continuation_for_implicit_division_by_zero_exception. All other
3236 // implicit exceptions (e.g., NullPointerException or
3237 // AbstractMethodError on entry) are either at call sites or
3238 // otherwise assume that stack unwinding will be initiated, so
3239 // caller-saved registers were assumed volatile in the compiler.
3240 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3241 assert(is_throw_id(id), "expected a throw stub id");
3242 
3243 const char* name = SharedRuntime::stub_name(id);
3244 
3245 // Information about frame layout at time of blocking runtime call.
3246 // Note that we only have to preserve callee-saved registers since 3247 // the compilers are responsible for supplying a continuation point 3248 // if they expect all registers to be preserved. 3249 enum layout { 3250 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3251 rbp_off2, 3252 return_off, 3253 return_off2, 3254 framesize // inclusive of return address 3255 }; 3256 3257 int insts_size = 512; 3258 int locs_size = 64; 3259 3260 ResourceMark rm; 3261 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3262 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3263 3264 CodeBuffer code(name, insts_size, locs_size); 3265 OopMapSet* oop_maps = new OopMapSet(); 3266 MacroAssembler* masm = new MacroAssembler(&code); 3267 3268 address start = __ pc(); 3269 3270 // This is an inlined and slightly modified version of call_VM 3271 // which has the ability to fetch the return PC out of 3272 // thread-local storage and also sets up last_Java_sp slightly 3273 // differently than the real call_VM 3274 3275 __ enter(); // required for proper stackwalking of RuntimeStub frame 3276 3277 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3278 3279 // return address and rbp are already in place 3280 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3281 3282 int frame_complete = __ pc() - start; 3283 3284 // Set up last_Java_sp and last_Java_fp 3285 address the_pc = __ pc(); 3286 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3287 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3288 3289 // Call runtime 3290 __ movptr(c_rarg0, r15_thread); 3291 BLOCK_COMMENT("call runtime_entry"); 3292 __ call(RuntimeAddress(runtime_entry)); 3293 3294 // Generate oop map 3295 OopMap* map = new OopMap(framesize, 0); 3296 3297 oop_maps->add_gc_map(the_pc - start, map); 3298 3299 __ reset_last_Java_frame(true); 3300 3301 __ leave(); // required for proper stackwalking of RuntimeStub frame 3302 3303 // check for pending exceptions 3304 #ifdef ASSERT 3305 Label L; 3306 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3307 __ jcc(Assembler::notEqual, L); 3308 __ should_not_reach_here(); 3309 __ bind(L); 3310 #endif // ASSERT 3311 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3312 3313 3314 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3315 RuntimeStub* stub = 3316 RuntimeStub::new_runtime_stub(name, 3317 &code, 3318 frame_complete, 3319 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3320 oop_maps, false); 3321 return stub; 3322 } 3323 3324 //------------------------------Montgomery multiplication------------------------ 3325 // 3326 3327 #ifndef _WINDOWS 3328 3329 // Subtract 0:b from carry:a. Return carry. 3330 static julong 3331 sub(julong a[], julong b[], julong carry, long len) { 3332 long long i = 0, cnt = len; 3333 julong tmp; 3334 asm volatile("clc; " 3335 "0: ; " 3336 "mov (%[b], %[i], 8), %[tmp]; " 3337 "sbb %[tmp], (%[a], %[i], 8); " 3338 "inc %[i]; dec %[cnt]; " 3339 "jne 0b; " 3340 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3341 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3342 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3343 : "memory"); 3344 return tmp; 3345 } 3346 3347 // Multiply (unsigned) Long A by Long B, accumulating the double- 3348 // length result into the accumulator formed of T0, T1, and T2. 
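// In other words, the macro below performs (T2:T1:T0) += A * B on a 192-bit
// accumulator.  A portable sketch of the same computation (the 128-bit
// temporary is illustrative only, not part of the real code):
//   unsigned __int128 p = (unsigned __int128)A * B;
//   T0 += (julong)p;                    // carry0 = carry out of this add
//   T1 += (julong)(p >> 64) + carry0;   // carry1 = carry out of this add
//   T2 += carry1;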
3349 #define MACC(A, B, T0, T1, T2) \ 3350 do { \ 3351 unsigned long hi, lo; \ 3352 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3353 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3354 : "r"(A), "a"(B) : "cc"); \ 3355 } while(0) 3356 3357 // As above, but add twice the double-length result into the 3358 // accumulator. 3359 #define MACC2(A, B, T0, T1, T2) \ 3360 do { \ 3361 unsigned long hi, lo; \ 3362 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3363 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3364 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3365 : "r"(A), "a"(B) : "cc"); \ 3366 } while(0) 3367 3368 #else //_WINDOWS 3369 3370 static julong 3371 sub(julong a[], julong b[], julong carry, long len) { 3372 long i; 3373 julong tmp; 3374 unsigned char c = 1; 3375 for (i = 0; i < len; i++) { 3376 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3377 a[i] = tmp; 3378 } 3379 c = _addcarry_u64(c, carry, ~0, &tmp); 3380 return tmp; 3381 } 3382 3383 // Multiply (unsigned) Long A by Long B, accumulating the double- 3384 // length result into the accumulator formed of T0, T1, and T2. 3385 #define MACC(A, B, T0, T1, T2) \ 3386 do { \ 3387 julong hi, lo; \ 3388 lo = _umul128(A, B, &hi); \ 3389 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3390 c = _addcarry_u64(c, hi, T1, &T1); \ 3391 _addcarry_u64(c, T2, 0, &T2); \ 3392 } while(0) 3393 3394 // As above, but add twice the double-length result into the 3395 // accumulator. 3396 #define MACC2(A, B, T0, T1, T2) \ 3397 do { \ 3398 julong hi, lo; \ 3399 lo = _umul128(A, B, &hi); \ 3400 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3401 c = _addcarry_u64(c, hi, T1, &T1); \ 3402 _addcarry_u64(c, T2, 0, &T2); \ 3403 c = _addcarry_u64(0, lo, T0, &T0); \ 3404 c = _addcarry_u64(c, hi, T1, &T1); \ 3405 _addcarry_u64(c, T2, 0, &T2); \ 3406 } while(0) 3407 3408 #endif //_WINDOWS 3409 3410 // Fast Montgomery multiplication. The derivation of the algorithm is 3411 // in A Cryptographic Library for the Motorola DSP56000, 3412 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3413 3414 static void NOINLINE 3415 montgomery_multiply(julong a[], julong b[], julong n[], 3416 julong m[], julong inv, int len) { 3417 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3418 int i; 3419 3420 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3421 3422 for (i = 0; i < len; i++) { 3423 int j; 3424 for (j = 0; j < i; j++) { 3425 MACC(a[j], b[i-j], t0, t1, t2); 3426 MACC(m[j], n[i-j], t0, t1, t2); 3427 } 3428 MACC(a[i], b[0], t0, t1, t2); 3429 m[i] = t0 * inv; 3430 MACC(m[i], n[0], t0, t1, t2); 3431 3432 assert(t0 == 0, "broken Montgomery multiply"); 3433 3434 t0 = t1; t1 = t2; t2 = 0; 3435 } 3436 3437 for (i = len; i < 2*len; i++) { 3438 int j; 3439 for (j = i-len+1; j < len; j++) { 3440 MACC(a[j], b[i-j], t0, t1, t2); 3441 MACC(m[j], n[i-j], t0, t1, t2); 3442 } 3443 m[i-len] = t0; 3444 t0 = t1; t1 = t2; t2 = 0; 3445 } 3446 3447 while (t0) 3448 t0 = sub(m, n, t0, len); 3449 } 3450 3451 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3452 // multiplies so it should be up to 25% faster than Montgomery 3453 // multiplication. However, its loop control is more complex and it 3454 // may actually run slower on some machines. 
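// Where the ~25% comes from: in the column loops below, the symmetric
// products a[j]*a[i-j] and a[i-j]*a[j] are folded into a single MACC2 (which
// adds the product twice), roughly halving the a*a multiplies, while the m*n
// multiplies are unchanged. The total multiply count is therefore about three
// quarters of montgomery_multiply's.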
3455 
3456 static void NOINLINE
3457 montgomery_square(julong a[], julong n[],
3458 julong m[], julong inv, int len) {
3459 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3460 int i;
3461 
3462 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3463 
3464 for (i = 0; i < len; i++) {
3465 int j;
3466 int end = (i+1)/2;
3467 for (j = 0; j < end; j++) {
3468 MACC2(a[j], a[i-j], t0, t1, t2);
3469 MACC(m[j], n[i-j], t0, t1, t2);
3470 }
3471 if ((i & 1) == 0) {
3472 MACC(a[j], a[j], t0, t1, t2);
3473 }
3474 for (; j < i; j++) {
3475 MACC(m[j], n[i-j], t0, t1, t2);
3476 }
3477 m[i] = t0 * inv;
3478 MACC(m[i], n[0], t0, t1, t2);
3479 
3480 assert(t0 == 0, "broken Montgomery square");
3481 
3482 t0 = t1; t1 = t2; t2 = 0;
3483 }
3484 
3485 for (i = len; i < 2*len; i++) {
3486 int start = i-len+1;
3487 int end = start + (len - start)/2;
3488 int j;
3489 for (j = start; j < end; j++) {
3490 MACC2(a[j], a[i-j], t0, t1, t2);
3491 MACC(m[j], n[i-j], t0, t1, t2);
3492 }
3493 if ((i & 1) == 0) {
3494 MACC(a[j], a[j], t0, t1, t2);
3495 }
3496 for (; j < len; j++) {
3497 MACC(m[j], n[i-j], t0, t1, t2);
3498 }
3499 m[i-len] = t0;
3500 t0 = t1; t1 = t2; t2 = 0;
3501 }
3502 
3503 while (t0)
3504 t0 = sub(m, n, t0, len);
3505 }
3506 
3507 // Swap the two 32-bit words in a longword.
3508 static julong swap(julong x) {
3509 return (x << 32) | (x >> 32);
3510 }
3511 
3512 // Copy len longwords from s to d, word-swapping as we go. The
3513 // destination array is reversed.
3514 static void reverse_words(julong *s, julong *d, int len) {
3515 d += len;
3516 while(len-- > 0) {
3517 d--;
3518 *d = swap(*s);
3519 s++;
3520 }
3521 }
3522 
3523 // The threshold at which squaring is advantageous was determined
3524 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3525 #define MONTGOMERY_SQUARING_THRESHOLD 64
3526 
3527 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3528 jint len, jlong inv,
3529 jint *m_ints) {
3530 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3531 int longwords = len/2;
3532 
3533 // Make very sure we don't use so much space that the stack might
3534 // overflow. 512 jints corresponds to a 16384-bit integer and
3535 // will use a total of 8k bytes of stack space here.
3536 int divisor = sizeof(julong) * 4;
3537 guarantee(longwords <= 8192 / divisor, "must be");
3538 int total_allocation = longwords * sizeof (julong) * 4;
3539 julong *scratch = (julong *)alloca(total_allocation);
3540 
3541 // Local scratch arrays
3542 julong
3543 *a = scratch + 0 * longwords,
3544 *b = scratch + 1 * longwords,
3545 *n = scratch + 2 * longwords,
3546 *m = scratch + 3 * longwords;
3547 
3548 reverse_words((julong *)a_ints, a, longwords);
3549 reverse_words((julong *)b_ints, b, longwords);
3550 reverse_words((julong *)n_ints, n, longwords);
3551 
3552 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3553 
3554 reverse_words(m, (julong *)m_ints, longwords);
3555 }
3556 
3557 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3558 jint len, jlong inv,
3559 jint *m_ints) {
3560 assert(len % 2 == 0, "array length in montgomery_square must be even");
3561 int longwords = len/2;
3562 
3563 // Make very sure we don't use so much space that the stack might
3564 // overflow. 512 jints corresponds to a 16384-bit integer and
3565 // will use a total of 6k bytes of stack space here.
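// Worked out: 512 jints are 256 julongs, so the three scratch arrays take
// 3 * 256 * sizeof(julong) = 6144 bytes; the guarantee below allows at most
// 8192 / (3 * sizeof(julong)) = 341 julongs per array, i.e. an alloca of no
// more than 341 * 24 = 8184 bytes.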
3566 int divisor = sizeof(julong) * 3; 3567 guarantee(longwords <= (8192 / divisor), "must be"); 3568 int total_allocation = longwords * sizeof (julong) * 3; 3569 julong *scratch = (julong *)alloca(total_allocation); 3570 3571 // Local scratch arrays 3572 julong 3573 *a = scratch + 0 * longwords, 3574 *n = scratch + 1 * longwords, 3575 *m = scratch + 2 * longwords; 3576 3577 reverse_words((julong *)a_ints, a, longwords); 3578 reverse_words((julong *)n_ints, n, longwords); 3579 3580 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3581 ::montgomery_square(a, n, m, (julong)inv, longwords); 3582 } else { 3583 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3584 } 3585 3586 reverse_words(m, (julong *)m_ints, longwords); 3587 } 3588 3589 #if INCLUDE_JFR 3590 3591 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 3592 // It returns a jobject handle to the event writer. 3593 // The handle is dereferenced and the return value is the event writer oop. 3594 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 3595 enum layout { 3596 rbp_off, 3597 rbpH_off, 3598 return_off, 3599 return_off2, 3600 framesize // inclusive of return address 3601 }; 3602 3603 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id); 3604 CodeBuffer code(name, 1024, 64); 3605 MacroAssembler* masm = new MacroAssembler(&code); 3606 address start = __ pc(); 3607 3608 __ enter(); 3609 address the_pc = __ pc(); 3610 3611 int frame_complete = the_pc - start; 3612 3613 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3614 __ movptr(c_rarg0, r15_thread); 3615 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 3616 __ reset_last_Java_frame(true); 3617 3618 // rax is jobject handle result, unpack and process it through a barrier. 3619 __ resolve_global_jobject(rax, r15_thread, c_rarg0); 3620 3621 __ leave(); 3622 __ ret(0); 3623 3624 OopMapSet* oop_maps = new OopMapSet(); 3625 OopMap* map = new OopMap(framesize, 1); 3626 oop_maps->add_gc_map(frame_complete, map); 3627 3628 RuntimeStub* stub = 3629 RuntimeStub::new_runtime_stub(name, 3630 &code, 3631 frame_complete, 3632 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3633 oop_maps, 3634 false); 3635 return stub; 3636 } 3637 3638 // For c2: call to return a leased buffer. 3639 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 3640 enum layout { 3641 rbp_off, 3642 rbpH_off, 3643 return_off, 3644 return_off2, 3645 framesize // inclusive of return address 3646 }; 3647 3648 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id); 3649 CodeBuffer code(name, 1024, 64); 3650 MacroAssembler* masm = new MacroAssembler(&code); 3651 address start = __ pc(); 3652 3653 __ enter(); 3654 address the_pc = __ pc(); 3655 3656 int frame_complete = the_pc - start; 3657 3658 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 3659 __ movptr(c_rarg0, r15_thread); 3660 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 3661 __ reset_last_Java_frame(true); 3662 3663 __ leave(); 3664 __ ret(0); 3665 3666 OopMapSet* oop_maps = new OopMapSet(); 3667 OopMap* map = new OopMap(framesize, 1); 3668 oop_maps->add_gc_map(frame_complete, map); 3669 3670 RuntimeStub* stub = 3671 RuntimeStub::new_runtime_stub(name, 3672 &code, 3673 frame_complete, 3674 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3675 oop_maps, 3676 false); 3677 return stub; 3678 } 3679 3680 #endif // INCLUDE_JFR 3681