1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef _WINDOWS 26 #include "alloca.h" 27 #endif 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/nativeInst.hpp" 34 #include "code/vtableStubs.hpp" 35 #include "compiler/oopMap.hpp" 36 #include "gc/shared/collectedHeap.hpp" 37 #include "gc/shared/gcLocker.hpp" 38 #include "gc/shared/barrierSet.hpp" 39 #include "gc/shared/barrierSetAssembler.hpp" 40 #include "interpreter/interpreter.hpp" 41 #include "logging/log.hpp" 42 #include "memory/resourceArea.hpp" 43 #include "memory/universe.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "oops/method.inline.hpp" 46 #include "prims/methodHandles.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/globals.hpp" 50 #include "runtime/jniHandles.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/timerTrace.hpp" 56 #include "runtime/vframeArray.hpp" 57 #include "runtime/vm_version.hpp" 58 #include "utilities/align.hpp" 59 #include "utilities/checkedCast.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif // PRODUCT 79 80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 81 82 class RegisterSaver { 83 // Capture info about frame layout. Layout offsets are in jint 84 // units because compiler frame slots are jints. 
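// The XSAVE_AREA_* constants below are byte offsets into the xsave image that
// push_FPU_state() writes at the low end of this frame. As a sketch of how the
// save loops in save_live_registers() consume them (offsets and per-entry
// sizes read off the code below; illustrative only, not a statement about the
// hardware xsave format):
//
//   [ 160,  416)  xmm0..xmm15, 16 bytes each               XSAVE_AREA_BEGIN
//   [ 576,  832)  upper halves of ymm0..ymm15, 16 bytes    XSAVE_AREA_YMM_BEGIN
//   [ 960, 1088)  r16..r31 (APX eGPRs), 8 bytes each       XSAVE_AREA_EGPRS
//   [1088, 1152)  k0..k7 opmask registers, 8 bytes each    XSAVE_AREA_OPMASK_BEGIN
//   [1152, 1664)  upper halves of zmm0..zmm15, 32 bytes    XSAVE_AREA_ZMM_BEGIN
//   [1664,  ...)  zmm16..zmm31 in full, 64 bytes each      XSAVE_AREA_UPPERBANK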
85 #define XSAVE_AREA_BEGIN 160 86 #define XSAVE_AREA_YMM_BEGIN 576 87 #define XSAVE_AREA_EGPRS 960 88 #define XSAVE_AREA_OPMASK_BEGIN 1088 89 #define XSAVE_AREA_ZMM_BEGIN 1152 90 #define XSAVE_AREA_UPPERBANK 1664 91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 96 enum layout { 97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 99 DEF_XMM_OFFS(0), 100 DEF_XMM_OFFS(1), 101 // 2..15 are implied in range usage 102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 103 DEF_YMM_OFFS(0), 104 DEF_YMM_OFFS(1), 105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 106 r16H_off, 107 r17_off, r17H_off, 108 r18_off, r18H_off, 109 r19_off, r19H_off, 110 r20_off, r20H_off, 111 r21_off, r21H_off, 112 r22_off, r22H_off, 113 r23_off, r23H_off, 114 r24_off, r24H_off, 115 r25_off, r25H_off, 116 r26_off, r26H_off, 117 r27_off, r27H_off, 118 r28_off, r28H_off, 119 r29_off, r29H_off, 120 r30_off, r30H_off, 121 r31_off, r31H_off, 122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 123 DEF_OPMASK_OFFS(0), 124 DEF_OPMASK_OFFS(1), 125 // 2..7 are implied in range usage 126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 127 DEF_ZMM_OFFS(0), 128 DEF_ZMM_OFFS(1), 129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 130 DEF_ZMM_UPPER_OFFS(16), 131 DEF_ZMM_UPPER_OFFS(17), 132 // 18..31 are implied in range usage 133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 134 fpu_stateH_end, 135 r15_off, r15H_off, 136 r14_off, r14H_off, 137 r13_off, r13H_off, 138 r12_off, r12H_off, 139 r11_off, r11H_off, 140 r10_off, r10H_off, 141 r9_off, r9H_off, 142 r8_off, r8H_off, 143 rdi_off, rdiH_off, 144 rsi_off, rsiH_off, 145 ignore_off, ignoreH_off, // extra copy of rbp 146 rsp_off, rspH_off, 147 rbx_off, rbxH_off, 148 rdx_off, rdxH_off, 149 rcx_off, rcxH_off, 150 rax_off, raxH_off, 151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 152 align_off, alignH_off, 153 flags_off, flagsH_off, 154 // The frame sender code expects that rbp will be in the "natural" place and 155 // will override any oopMap setting for it. We must therefore force the layout 156 // so that it agrees with the frame sender code. 
157 rbp_off, rbpH_off, // copy of rbp we will restore 158 return_off, returnH_off, // slot for return address 159 reg_save_size // size in compiler stack slots 160 }; 161 162 public: 163 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors); 164 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false); 165 166 // Offsets into the register save area 167 // Used by deoptimization when it is managing result register 168 // values on its own 169 170 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; } 171 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; } 172 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; } 173 static int r15_offset_in_bytes(void) { return BytesPerInt * r15_off; } 174 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; } 175 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; } 176 177 // During deoptimization only the result registers need to be restored, 178 // all the other values have already been extracted. 179 static void restore_result_registers(MacroAssembler* masm); 180 }; 181 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) { 183 int off = 0; 184 int num_xmm_regs = XMMRegister::available_xmm_registers(); 185 #if COMPILER2_OR_JVMCI 186 if (save_wide_vectors && UseAVX == 0) { 187 save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX 188 } 189 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 190 #else 191 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI 192 #endif 193 194 // Always make the frame size 16-byte aligned, both vector and non-vector stacks are always allocated 195 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs); 196 // OopMap frame size is in compiler stack slots (jint's) not bytes or words 197 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 198 // CodeBlob frame size is in words. 199 int frame_size_in_words = frame_size_in_bytes / wordSize; 200 *total_frame_words = frame_size_in_words; 201 202 // Save registers, fpu state, and flags. 203 // We assume the caller has already pushed the return address onto the 204 // stack, so rsp is 8-byte aligned here. 205 // We push rbp twice in this sequence because we want the real rbp 206 // to be under the return address like a normal enter.
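// A sketch of the resulting frame, for orientation only (derived from the
// layout enum above, reading from the highest offsets down, i.e. from higher
// to lower addresses):
//
//   return address                         return_off   (pushed by the caller)
//   saved rbp                              rbp_off      (pushed by enter())
//   rflags / 16-byte alignment word        flags_off, align_off
//   rax .. r15, including an extra,
//     ignored copy of rbp                  (save_legacy_gprs())
//   xsave image of FPUStateSizeInWords
//     words, laid out as sketched above    (push_FPU_state())
//   argument register save area            (Win64 only, allocated last)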
207 208 __ enter(); // rsp becomes 16-byte aligned here 209 __ pushf(); 210 // Make sure rsp stays 16-byte aligned 211 __ subq(rsp, 8); 212 // Push CPU state in multiple of 16 bytes 213 __ save_legacy_gprs(); 214 __ push_FPU_state(); 215 216 217 // push cpu state handles this on EVEX enabled targets 218 if (save_wide_vectors) { 219 // Save upper half of YMM registers(0..15) 220 int base_addr = XSAVE_AREA_YMM_BEGIN; 221 for (int n = 0; n < 16; n++) { 222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 223 } 224 if (VM_Version::supports_evex()) { 225 // Save upper half of ZMM registers(0..15) 226 base_addr = XSAVE_AREA_ZMM_BEGIN; 227 for (int n = 0; n < 16; n++) { 228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 229 } 230 // Save full ZMM registers(16..num_xmm_regs) 231 base_addr = XSAVE_AREA_UPPERBANK; 232 off = 0; 233 int vector_len = Assembler::AVX_512bit; 234 for (int n = 16; n < num_xmm_regs; n++) { 235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 236 } 237 #if COMPILER2_OR_JVMCI 238 base_addr = XSAVE_AREA_OPMASK_BEGIN; 239 off = 0; 240 for(int n = 0; n < KRegister::number_of_registers; n++) { 241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 242 } 243 #endif 244 } 245 } else { 246 if (VM_Version::supports_evex()) { 247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 248 int base_addr = XSAVE_AREA_UPPERBANK; 249 off = 0; 250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 251 for (int n = 16; n < num_xmm_regs; n++) { 252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 253 } 254 #if COMPILER2_OR_JVMCI 255 base_addr = XSAVE_AREA_OPMASK_BEGIN; 256 off = 0; 257 for(int n = 0; n < KRegister::number_of_registers; n++) { 258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 259 } 260 #endif 261 } 262 } 263 264 #if COMPILER2_OR_JVMCI 265 if (UseAPX) { 266 int base_addr = XSAVE_AREA_EGPRS; 267 off = 0; 268 for (int n = 16; n < Register::number_of_registers; n++) { 269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 270 } 271 } 272 #endif 273 274 __ vzeroupper(); 275 if (frame::arg_reg_save_area_bytes != 0) { 276 // Allocate argument register save area 277 __ subptr(rsp, frame::arg_reg_save_area_bytes); 278 } 279 280 // Set an oopmap for the call site. This oopmap will map all 281 // oop-registers and debug-info registers as callee-saved. This 282 // will allow deoptimization at this safepoint to find all possible 283 // debug-info recordings, as well as let GC find all oops. 
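// For example, the set_callee_saved(STACK_OFFSET(rax_off), ...) entry below
// records that the value that was live in rax now sits in stack slot rax_off
// of this frame, i.e. at rsp + rax_off * BytesPerInt once the frame is
// complete. Deoptimization later reads it back from exactly that slot via
// rax_offset_in_bytes() in restore_result_registers().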
284 285 OopMapSet *oop_maps = new OopMapSet(); 286 OopMap* map = new OopMap(frame_size_in_slots, 0); 287 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 289 290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 294 // rbp location is known implicitly by the frame sender code, needs no oopmap 295 // and the location where rbp was saved by is ignored 296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 306 307 if (UseAPX) { 308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 324 } 325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 326 // on EVEX enabled targets, we get it included in the xsave area 327 off = xmm0_off; 328 int delta = xmm1_off - off; 329 for (int n = 0; n < 16; n++) { 330 XMMRegister xmm_name = as_XMMRegister(n); 331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 332 off += delta; 333 } 334 if (UseAVX > 2) { 335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 336 off = zmm16_off; 337 delta = zmm17_off - off; 338 for (int n = 16; n < num_xmm_regs; n++) { 339 XMMRegister zmm_name = as_XMMRegister(n); 340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 341 off += delta; 342 } 343 } 344 345 #if COMPILER2_OR_JVMCI 346 if (save_wide_vectors) { 347 // Save upper half of YMM registers(0..15) 348 off = ymm0_off; 349 delta = ymm1_off - ymm0_off; 350 for (int n = 0; n < 16; n++) { 351 XMMRegister ymm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 353 off += delta; 354 } 355 if (VM_Version::supports_evex()) { 356 // Save upper half of ZMM registers(0..15) 357 off = zmm0_off; 
358 delta = zmm1_off - zmm0_off; 359 for (int n = 0; n < 16; n++) { 360 XMMRegister zmm_name = as_XMMRegister(n); 361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 362 off += delta; 363 } 364 } 365 } 366 #endif // COMPILER2_OR_JVMCI 367 368 // %%% These should all be a waste but we'll keep things as they were for now 369 if (true) { 370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 374 // rbp location is known implicitly by the frame sender code, needs no oopmap 375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 385 if (UseAPX) { 386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 402 } 403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 404 // on EVEX enabled targets, we get it included in the xsave area 405 off = xmm0H_off; 406 delta = xmm1H_off - off; 407 for (int n = 0; n < 16; n++) { 408 XMMRegister xmm_name = as_XMMRegister(n); 409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 410 off += delta; 411 } 412 if (UseAVX > 2) { 413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 414 off = zmm16H_off; 415 delta = zmm17H_off - off; 416 for (int n = 16; n < num_xmm_regs; n++) { 417 XMMRegister zmm_name = as_XMMRegister(n); 418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 419 off += delta; 420 } 421 } 422 } 423 424 return map; 
425 } 426 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 428 int num_xmm_regs = XMMRegister::available_xmm_registers(); 429 if (frame::arg_reg_save_area_bytes != 0) { 430 // Pop arg register save area 431 __ addptr(rsp, frame::arg_reg_save_area_bytes); 432 } 433 434 #if COMPILER2_OR_JVMCI 435 if (restore_wide_vectors) { 436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 438 } 439 #else 440 assert(!restore_wide_vectors, "vectors are generated only by C2"); 441 #endif 442 443 __ vzeroupper(); 444 445 // On EVEX enabled targets everything is handled in pop fpu state 446 if (restore_wide_vectors) { 447 // Restore upper half of YMM registers (0..15) 448 int base_addr = XSAVE_AREA_YMM_BEGIN; 449 for (int n = 0; n < 16; n++) { 450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 451 } 452 if (VM_Version::supports_evex()) { 453 // Restore upper half of ZMM registers (0..15) 454 base_addr = XSAVE_AREA_ZMM_BEGIN; 455 for (int n = 0; n < 16; n++) { 456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 457 } 458 // Restore full ZMM registers(16..num_xmm_regs) 459 base_addr = XSAVE_AREA_UPPERBANK; 460 int vector_len = Assembler::AVX_512bit; 461 int off = 0; 462 for (int n = 16; n < num_xmm_regs; n++) { 463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 464 } 465 #if COMPILER2_OR_JVMCI 466 base_addr = XSAVE_AREA_OPMASK_BEGIN; 467 off = 0; 468 for (int n = 0; n < KRegister::number_of_registers; n++) { 469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 470 } 471 #endif 472 } 473 } else { 474 if (VM_Version::supports_evex()) { 475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 476 int base_addr = XSAVE_AREA_UPPERBANK; 477 int off = 0; 478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 479 for (int n = 16; n < num_xmm_regs; n++) { 480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 481 } 482 #if COMPILER2_OR_JVMCI 483 base_addr = XSAVE_AREA_OPMASK_BEGIN; 484 off = 0; 485 for (int n = 0; n < KRegister::number_of_registers; n++) { 486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 487 } 488 #endif 489 } 490 } 491 492 #if COMPILER2_OR_JVMCI 493 if (UseAPX) { 494 int base_addr = XSAVE_AREA_EGPRS; 495 int off = 0; 496 for (int n = 16; n < Register::number_of_registers; n++) { 497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 498 } 499 } 500 #endif 501 502 // Recover CPU state 503 __ pop_FPU_state(); 504 __ restore_legacy_gprs(); 505 __ addq(rsp, 8); 506 __ popf(); 507 // Get the rbp described implicitly by the calling convention (no oopMap) 508 __ pop(rbp); 509 } 510 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 512 513 // Just restore result register. Only used by deoptimization. By 514 // now any callee save register that needs to be restored to a c2 515 // caller of the deoptee has been extracted into the vframeArray 516 // and will be stuffed into the c2i adapter we create for later 517 // restoration so only result registers need to be restored here. 
518 519 // Restore fp result register 520 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes())); 521 // Restore integer result register 522 __ movptr(rax, Address(rsp, rax_offset_in_bytes())); 523 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes())); 524 525 // Pop all of the register save area off the stack except the return address 526 __ addptr(rsp, return_offset_in_bytes()); 527 } 528 529 // Is vector's size (in bytes) bigger than a size saved by default? 530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions. 531 bool SharedRuntime::is_wide_vector(int size) { 532 return size > 16; 533 } 534 535 // --------------------------------------------------------------------------- 536 // Read the array of BasicTypes from a signature, and compute where the 537 // arguments should go. Values in the VMRegPair regs array refer to 4-byte 538 // quantities. Values less than VMRegImpl::stack0 are registers, those above 539 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer 540 // as framesizes are fixed. 541 // VMRegImpl::stack0 refers to the first slot 0(sp), 542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher. 543 // Registers up to Register::number_of_registers are the 64-bit 544 // integer registers. 545 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are 547 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit 548 // units regardless of build. Of course for i486 there is no 64-bit build. 549 550 // The Java calling convention is a "shifted" version of the C ABI. 551 // By skipping the first C ABI register we can call non-static jni methods 552 // with small numbers of arguments without having to shuffle the arguments 553 // at all. Since we control the java ABI we ought to at least get some 554 // advantage out of it. 555 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt, 557 VMRegPair *regs, 558 int total_args_passed) { 559 560 // Create the mapping between argument positions and 561 // registers.
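// Worked example (illustrative only, not exercised by any code here): for
//   sig_bt = { T_INT, T_OBJECT, T_LONG, T_VOID, T_DOUBLE, T_VOID, T_FLOAT }
// the loop below assigns
//   T_INT    -> j_rarg0  (set1)
//   T_OBJECT -> j_rarg1  (set2)
//   T_LONG   -> j_rarg2  (set2, the trailing T_VOID half is set_bad)
//   T_DOUBLE -> j_farg0  (set2, trailing T_VOID half set_bad)
//   T_FLOAT  -> j_farg1  (set1)
// and returns stk_args == 0. Only once the six j_rargs or eight j_fargs are
// exhausted do arguments spill to 4-byte stack slots, each spilled value
// starting at an 8-byte aligned slot index (see the align_up(stk_args, 2)
// calls).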
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 564 }; 565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 566 j_farg0, j_farg1, j_farg2, j_farg3, 567 j_farg4, j_farg5, j_farg6, j_farg7 568 }; 569 570 571 uint int_args = 0; 572 uint fp_args = 0; 573 uint stk_args = 0; 574 575 for (int i = 0; i < total_args_passed; i++) { 576 switch (sig_bt[i]) { 577 case T_BOOLEAN: 578 case T_CHAR: 579 case T_BYTE: 580 case T_SHORT: 581 case T_INT: 582 if (int_args < Argument::n_int_register_parameters_j) { 583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 584 } else { 585 stk_args = align_up(stk_args, 2); 586 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 587 stk_args += 1; 588 } 589 break; 590 case T_VOID: 591 // halves of T_LONG or T_DOUBLE 592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 593 regs[i].set_bad(); 594 break; 595 case T_LONG: 596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 597 // fall through 598 case T_OBJECT: 599 case T_ARRAY: 600 case T_ADDRESS: 601 if (int_args < Argument::n_int_register_parameters_j) { 602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 603 } else { 604 stk_args = align_up(stk_args, 2); 605 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 606 stk_args += 2; 607 } 608 break; 609 case T_FLOAT: 610 if (fp_args < Argument::n_float_register_parameters_j) { 611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 612 } else { 613 stk_args = align_up(stk_args, 2); 614 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 615 stk_args += 1; 616 } 617 break; 618 case T_DOUBLE: 619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 620 if (fp_args < Argument::n_float_register_parameters_j) { 621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 622 } else { 623 stk_args = align_up(stk_args, 2); 624 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 625 stk_args += 2; 626 } 627 break; 628 default: 629 ShouldNotReachHere(); 630 break; 631 } 632 } 633 634 return stk_args; 635 } 636 637 // Patch the callers callsite with entry to compiled code if it exists. 638 static void patch_callers_callsite(MacroAssembler *masm) { 639 Label L; 640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 641 __ jcc(Assembler::equal, L); 642 643 // Save the current stack pointer 644 __ mov(r13, rsp); 645 // Schedule the branch target address early. 
646 // Call into the VM to patch the caller, then jump to compiled callee 647 // rax isn't live so capture return address while we easily can 648 __ movptr(rax, Address(rsp, 0)); 649 650 // align stack so push_CPU_state doesn't fault 651 __ andptr(rsp, -(StackAlignmentInBytes)); 652 __ push_CPU_state(); 653 __ vzeroupper(); 654 // VM needs caller's callsite 655 // VM needs target method 656 // This needs to be a long call since we will relocate this adapter to 657 // the codeBuffer and it may not reach 658 659 // Allocate argument register save area 660 if (frame::arg_reg_save_area_bytes != 0) { 661 __ subptr(rsp, frame::arg_reg_save_area_bytes); 662 } 663 __ mov(c_rarg0, rbx); 664 __ mov(c_rarg1, rax); 665 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 666 667 // De-allocate argument register save area 668 if (frame::arg_reg_save_area_bytes != 0) { 669 __ addptr(rsp, frame::arg_reg_save_area_bytes); 670 } 671 672 __ vzeroupper(); 673 __ pop_CPU_state(); 674 // restore sp 675 __ mov(rsp, r13); 676 __ bind(L); 677 } 678 679 static void gen_c2i_adapter(MacroAssembler *masm, 680 int total_args_passed, 681 int comp_args_on_stack, 682 const BasicType *sig_bt, 683 const VMRegPair *regs, 684 Label& skip_fixup) { 685 // Before we get into the guts of the C2I adapter, see if we should be here 686 // at all. We've come from compiled code and are attempting to jump to the 687 // interpreter, which means the caller made a static call to get here 688 // (vcalls always get a compiled target if there is one). Check for a 689 // compiled target. If there is one, we need to patch the caller's call. 690 patch_callers_callsite(masm); 691 692 __ bind(skip_fixup); 693 694 // Since all args are passed on the stack, total_args_passed * 695 // Interpreter::stackElementSize is the space we need. 696 697 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 698 699 int extraspace = (total_args_passed * Interpreter::stackElementSize); 700 701 // stack is aligned, keep it that way 702 // This is not currently needed or enforced by the interpreter, but 703 // we might as well conform to the ABI. 704 extraspace = align_up(extraspace, 2*wordSize); 705 706 // set senderSP value 707 __ lea(r13, Address(rsp, wordSize)); 708 709 #ifdef ASSERT 710 __ check_stack_alignment(r13, "sender stack not aligned"); 711 #endif 712 if (extraspace > 0) { 713 // Pop the return address 714 __ pop(rax); 715 716 __ subptr(rsp, extraspace); 717 718 // Push the return address 719 __ push(rax); 720 721 // Account for the return address location since we store it first rather 722 // than hold it in a register across all the shuffling 723 extraspace += wordSize; 724 } 725 726 #ifdef ASSERT 727 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 728 #endif 729 730 // Now write the args into the outgoing interpreter space 731 for (int i = 0; i < total_args_passed; i++) { 732 if (sig_bt[i] == T_VOID) { 733 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 734 continue; 735 } 736 737 // offset to start parameters 738 int st_off = (total_args_passed - i) * Interpreter::stackElementSize; 739 int next_off = st_off - Interpreter::stackElementSize; 740 741 // Say 4 args: 742 // i st_off 743 // 0 32 T_LONG 744 // 1 24 T_VOID 745 // 2 16 T_OBJECT 746 // 3 8 T_BOOL 747 // - 0 return address 748 // 749 // However to make thing extra confusing. 
Because we can fit a long/double in 750 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 751 // leaves one slot empty and only stores to a single slot. In this case the 752 // slot that is occupied is the T_VOID slot. See I said it was confusing. 753 754 VMReg r_1 = regs[i].first(); 755 VMReg r_2 = regs[i].second(); 756 if (!r_1->is_valid()) { 757 assert(!r_2->is_valid(), ""); 758 continue; 759 } 760 if (r_1->is_stack()) { 761 // memory to memory use rax 762 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 763 if (!r_2->is_valid()) { 764 // sign extend?? 765 __ movl(rax, Address(rsp, ld_off)); 766 __ movptr(Address(rsp, st_off), rax); 767 768 } else { 769 770 __ movq(rax, Address(rsp, ld_off)); 771 772 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 773 // T_DOUBLE and T_LONG use two slots in the interpreter 774 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 775 // ld_off == LSW, ld_off+wordSize == MSW 776 // st_off == MSW, next_off == LSW 777 __ movq(Address(rsp, next_off), rax); 778 #ifdef ASSERT 779 // Overwrite the unused slot with known junk 780 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 781 __ movptr(Address(rsp, st_off), rax); 782 #endif /* ASSERT */ 783 } else { 784 __ movq(Address(rsp, st_off), rax); 785 } 786 } 787 } else if (r_1->is_Register()) { 788 Register r = r_1->as_Register(); 789 if (!r_2->is_valid()) { 790 // must be only an int (or less ) so move only 32bits to slot 791 // why not sign extend?? 792 __ movl(Address(rsp, st_off), r); 793 } else { 794 // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG 795 // T_DOUBLE and T_LONG use two slots in the interpreter 796 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) { 797 // long/double in gpr 798 #ifdef ASSERT 799 // Overwrite the unused slot with known junk 800 __ mov64(rax, CONST64(0xdeadffffdeadaaab)); 801 __ movptr(Address(rsp, st_off), rax); 802 #endif /* ASSERT */ 803 __ movq(Address(rsp, next_off), r); 804 } else { 805 __ movptr(Address(rsp, st_off), r); 806 } 807 } 808 } else { 809 assert(r_1->is_XMMRegister(), ""); 810 if (!r_2->is_valid()) { 811 // only a float use just part of the slot 812 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister()); 813 } else { 814 #ifdef ASSERT 815 // Overwrite the unused slot with known junk 816 __ mov64(rax, CONST64(0xdeadffffdeadaaac)); 817 __ movptr(Address(rsp, st_off), rax); 818 #endif /* ASSERT */ 819 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister()); 820 } 821 } 822 } 823 824 // Schedule the branch target address early. 825 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 826 __ jmp(rcx); 827 } 828 829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 830 int total_args_passed, 831 int comp_args_on_stack, 832 const BasicType *sig_bt, 833 const VMRegPair *regs) { 834 835 // Note: r13 contains the senderSP on entry. We must preserve it since 836 // we may do a i2c -> c2i transition if we lose a race where compiled 837 // code goes non-entrant while we get args ready. 838 // In addition we use r13 to locate all the interpreter args as 839 // we must align the stack to 16 bytes on an i2c entry else we 840 // lose alignment we expect in all compiled code and register 841 // save code can segv when fxsave instructions find improperly 842 // aligned stack pointer. 843 844 // Adapters can be frameless because they do not require the caller 845 // to perform additional cleanup work, such as correcting the stack pointer. 
846 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 847 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 848 // even if a callee has modified the stack pointer. 849 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 850 // routinely repairs its caller's stack pointer (from sender_sp, which is set 851 // up via the senderSP register). 852 // In other words, if *either* the caller or callee is interpreted, we can 853 // get the stack pointer repaired after a call. 854 // This is why c2i and i2c adapters cannot be indefinitely composed. 855 // In particular, if a c2i adapter were to somehow call an i2c adapter, 856 // both caller and callee would be compiled methods, and neither would 857 // clean up the stack pointer changes performed by the two adapters. 858 // If this happens, control eventually transfers back to the compiled 859 // caller, but with an uncorrected stack, causing delayed havoc. 860 861 // Must preserve original SP for loading incoming arguments because 862 // we need to align the outgoing SP for compiled code. 863 __ movptr(r11, rsp); 864 865 // Pick up the return address 866 __ pop(rax); 867 868 // Convert 4-byte c2 stack slots to words. 869 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 870 871 if (comp_args_on_stack) { 872 __ subptr(rsp, comp_words_on_stack * wordSize); 873 } 874 875 // Ensure compiled code always sees stack at proper alignment 876 __ andptr(rsp, -16); 877 878 // push the return address and misalign the stack that youngest frame always sees 879 // as far as the placement of the call instruction 880 __ push(rax); 881 882 // Put saved SP in another register 883 const Register saved_sp = rax; 884 __ movptr(saved_sp, r11); 885 886 // Will jump to the compiled code just as if compiled code was doing it. 887 // Pre-load the register-jump target early, to schedule it better. 888 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset()))); 889 890 #if INCLUDE_JVMCI 891 if (EnableJVMCI) { 892 // check if this call should be routed towards a specific entry point 893 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 894 Label no_alternative_target; 895 __ jcc(Assembler::equal, no_alternative_target); 896 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 897 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 898 __ bind(no_alternative_target); 899 } 900 #endif // INCLUDE_JVMCI 901 902 // Now generate the shuffle code. Pick up all register args and move the 903 // rest through the floating point stack top. 904 for (int i = 0; i < total_args_passed; i++) { 905 if (sig_bt[i] == T_VOID) { 906 // Longs and doubles are passed in native word order, but misaligned 907 // in the 32-bit build. 908 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); 909 continue; 910 } 911 912 // Pick up 0, 1 or 2 words from SP+offset. 913 914 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 915 "scrambled load targets?"); 916 // Load in argument order going down. 917 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 918 // Point to interpreter value (vs. 
tag) 919 int next_off = ld_off - Interpreter::stackElementSize; 920 // 921 // 922 // 923 VMReg r_1 = regs[i].first(); 924 VMReg r_2 = regs[i].second(); 925 if (!r_1->is_valid()) { 926 assert(!r_2->is_valid(), ""); 927 continue; 928 } 929 if (r_1->is_stack()) { 930 // Convert stack slot to an SP offset (+ wordSize to account for the return address) 931 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 932 933 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 934 // and if we end up going thru a c2i because of a miss, a reasonable value of r13 935 // will be generated. 936 if (!r_2->is_valid()) { 937 // sign extend??? 938 __ movl(r13, Address(saved_sp, ld_off)); 939 __ movptr(Address(rsp, st_off), r13); 940 } else { 941 // 942 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 943 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case 944 // So we must adjust where to pick up the data to match the interpreter. 945 // 946 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 947 // are accessed as negative so LSW is at LOW address 948 949 // ld_off is MSW so get LSW 950 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 951 next_off : ld_off; 952 __ movq(r13, Address(saved_sp, offset)); 953 // st_off is LSW (i.e. reg.first()) 954 __ movq(Address(rsp, st_off), r13); 955 } 956 } else if (r_1->is_Register()) { // Register argument 957 Register r = r_1->as_Register(); 958 assert(r != rax, "must be different"); 959 if (r_2->is_valid()) { 960 // 961 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 962 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case 963 // So we must adjust where to pick up the data to match the interpreter. 964 965 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)? 966 next_off : ld_off; 967 968 // this can be a misaligned move 969 __ movq(r, Address(saved_sp, offset)); 970 } else { 971 // sign extend and use a full word? 972 __ movl(r, Address(saved_sp, ld_off)); 973 } 974 } else { 975 if (!r_2->is_valid()) { 976 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 977 } else { 978 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 979 } 980 } 981 } 982 983 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 984 985 // 6243940 We might end up in handle_wrong_method if 986 // the callee is deoptimized as we race thru here. If that 987 // happens we don't want to take a safepoint because the 988 // caller frame will look interpreted and arguments are now 989 // "compiled" so it is much better to make this transition 990 // invisible to the stack walking code. Unfortunately if 991 // we try to find the callee by normal means a safepoint 992 // is possible. So we stash the desired callee in the thread 993 // and the VM will find it there should this case occur.
994 995 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 996 997 // put Method* where a c2i adapter would expect it should we end up there 998 // only needed because c2 resolve stubs return Method* as a result in 999 // rax 1000 __ mov(rax, rbx); 1001 __ jmp(r11); 1002 } 1003 1004 // --------------------------------------------------------------- 1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, 1006 int total_args_passed, 1007 int comp_args_on_stack, 1008 const BasicType *sig_bt, 1009 const VMRegPair *regs, 1010 AdapterHandlerEntry* handler) { 1011 address i2c_entry = __ pc(); 1012 1013 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); 1014 1015 // ------------------------------------------------------------------------- 1016 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1017 // to the interpreter. The args start out packed in the compiled layout. They 1018 // need to be unpacked into the interpreter layout. This will almost always 1019 // require some stack space. We grow the current (compiled) stack, then repack 1020 // the args. We finally end in a jump to the generic interpreter entry point. 1021 // On exit from the interpreter, the interpreter will restore our SP (lest the 1022 // compiled code, which relies solely on SP and not RBP, get sick). 1023 1024 address c2i_unverified_entry = __ pc(); 1025 Label skip_fixup; 1026 1027 Register data = rax; 1028 Register receiver = j_rarg0; 1029 Register temp = rbx; 1030 1031 { 1032 __ ic_check(1 /* end_alignment */); 1033 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1034 // Method might have been compiled since the call site was patched to 1035 // interpreted; if that is the case, treat it as a miss so we can get 1036 // the call site corrected. 1037 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1038 __ jcc(Assembler::equal, skip_fixup); 1039 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1040 } 1041 1042 address c2i_entry = __ pc(); 1043 1044 // Class initialization barrier for static methods 1045 address c2i_no_clinit_check_entry = nullptr; 1046 if (VM_Version::supports_fast_class_init_checks()) { 1047 Label L_skip_barrier; 1048 Register method = rbx; 1049 1050 { // Bypass the barrier for non-static methods 1051 Register flags = rscratch1; 1052 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset())); 1053 __ testl(flags, JVM_ACC_STATIC); 1054 __ jcc(Assembler::zero, L_skip_barrier); // non-static 1055 } 1056 1057 Register klass = rscratch1; 1058 __ load_method_holder(klass, method); 1059 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 1060 1061 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1062 1063 __ bind(L_skip_barrier); 1064 c2i_no_clinit_check_entry = __ pc(); 1065 } 1066 1067 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1068 bs->c2i_entry_barrier(masm); 1069 1070 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); 1071 1072 handler->set_entry_points(i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry); 1073 return; 1074 } 1075 1076 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1077 VMRegPair *regs, 1078 int total_args_passed) { 1079 1080 // We return the amount of VMRegImpl stack slots we need to reserve for all 1081 // the arguments NOT counting out_preserve_stack_slots.
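// Worked example (illustrative only): for
//   sig_bt = { T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID }
// the loop below assigns, on the non-Windows (SysV) path,
//   T_LONG -> c_rarg0, T_OBJECT -> c_rarg1, T_DOUBLE -> c_farg0
// and returns 0 stack slots. On _WIN64 the integer and float registers share
// the four argument positions, so the same signature yields
//   T_LONG -> c_rarg0, T_OBJECT -> c_rarg1, T_DOUBLE -> c_farg2
// and the function returns 8 slots, because the Windows ABI always reserves
// home space for four register arguments (see the stk_args < 8 clamp below).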
1082 1083 // NOTE: These arrays will have to change when c1 is ported 1084 #ifdef _WIN64 1085 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1086 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1087 }; 1088 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1089 c_farg0, c_farg1, c_farg2, c_farg3 1090 }; 1091 #else 1092 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1093 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1094 }; 1095 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1096 c_farg0, c_farg1, c_farg2, c_farg3, 1097 c_farg4, c_farg5, c_farg6, c_farg7 1098 }; 1099 #endif // _WIN64 1100 1101 1102 uint int_args = 0; 1103 uint fp_args = 0; 1104 uint stk_args = 0; // inc by 2 each time 1105 1106 for (int i = 0; i < total_args_passed; i++) { 1107 switch (sig_bt[i]) { 1108 case T_BOOLEAN: 1109 case T_CHAR: 1110 case T_BYTE: 1111 case T_SHORT: 1112 case T_INT: 1113 if (int_args < Argument::n_int_register_parameters_c) { 1114 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1115 #ifdef _WIN64 1116 fp_args++; 1117 // Allocate slots for callee to stuff register args the stack. 1118 stk_args += 2; 1119 #endif 1120 } else { 1121 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1122 stk_args += 2; 1123 } 1124 break; 1125 case T_LONG: 1126 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1127 // fall through 1128 case T_OBJECT: 1129 case T_ARRAY: 1130 case T_ADDRESS: 1131 case T_METADATA: 1132 if (int_args < Argument::n_int_register_parameters_c) { 1133 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1134 #ifdef _WIN64 1135 fp_args++; 1136 stk_args += 2; 1137 #endif 1138 } else { 1139 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1140 stk_args += 2; 1141 } 1142 break; 1143 case T_FLOAT: 1144 if (fp_args < Argument::n_float_register_parameters_c) { 1145 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1146 #ifdef _WIN64 1147 int_args++; 1148 // Allocate slots for callee to stuff register args the stack. 1149 stk_args += 2; 1150 #endif 1151 } else { 1152 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1153 stk_args += 2; 1154 } 1155 break; 1156 case T_DOUBLE: 1157 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1158 if (fp_args < Argument::n_float_register_parameters_c) { 1159 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1160 #ifdef _WIN64 1161 int_args++; 1162 // Allocate slots for callee to stuff register args the stack. 1163 stk_args += 2; 1164 #endif 1165 } else { 1166 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1167 stk_args += 2; 1168 } 1169 break; 1170 case T_VOID: // Halves of longs and doubles 1171 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1172 regs[i].set_bad(); 1173 break; 1174 default: 1175 ShouldNotReachHere(); 1176 break; 1177 } 1178 } 1179 #ifdef _WIN64 1180 // windows abi requires that we always allocate enough stack space 1181 // for 4 64bit registers to be stored down. 
1182 if (stk_args < 8) { 1183 stk_args = 8; 1184 } 1185 #endif // _WIN64 1186 1187 return stk_args; 1188 } 1189 1190 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1191 uint num_bits, 1192 uint total_args_passed) { 1193 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1194 "only certain vector sizes are supported for now"); 1195 1196 static const XMMRegister VEC_ArgReg[32] = { 1197 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1198 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1199 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1200 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1201 }; 1202 1203 uint stk_args = 0; 1204 uint fp_args = 0; 1205 1206 for (uint i = 0; i < total_args_passed; i++) { 1207 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1208 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1209 regs[i].set_pair(vmreg->next(next_val), vmreg); 1210 } 1211 1212 return stk_args; 1213 } 1214 1215 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1216 // We always ignore the frame_slots arg and just use the space just below frame pointer 1217 // which by this time is free to use 1218 switch (ret_type) { 1219 case T_FLOAT: 1220 __ movflt(Address(rbp, -wordSize), xmm0); 1221 break; 1222 case T_DOUBLE: 1223 __ movdbl(Address(rbp, -wordSize), xmm0); 1224 break; 1225 case T_VOID: break; 1226 default: { 1227 __ movptr(Address(rbp, -wordSize), rax); 1228 } 1229 } 1230 } 1231 1232 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1233 // We always ignore the frame_slots arg and just use the space just below frame pointer 1234 // which by this time is free to use 1235 switch (ret_type) { 1236 case T_FLOAT: 1237 __ movflt(xmm0, Address(rbp, -wordSize)); 1238 break; 1239 case T_DOUBLE: 1240 __ movdbl(xmm0, Address(rbp, -wordSize)); 1241 break; 1242 case T_VOID: break; 1243 default: { 1244 __ movptr(rax, Address(rbp, -wordSize)); 1245 } 1246 } 1247 } 1248 1249 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1250 for ( int i = first_arg ; i < arg_count ; i++ ) { 1251 if (args[i].first()->is_Register()) { 1252 __ push(args[i].first()->as_Register()); 1253 } else if (args[i].first()->is_XMMRegister()) { 1254 __ subptr(rsp, 2*wordSize); 1255 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1256 } 1257 } 1258 } 1259 1260 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1261 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1262 if (args[i].first()->is_Register()) { 1263 __ pop(args[i].first()->as_Register()); 1264 } else if (args[i].first()->is_XMMRegister()) { 1265 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1266 __ addptr(rsp, 2*wordSize); 1267 } 1268 } 1269 } 1270 1271 static void verify_oop_args(MacroAssembler* masm, 1272 const methodHandle& method, 1273 const BasicType* sig_bt, 1274 const VMRegPair* regs) { 1275 Register temp_reg = rbx; // not part of any compiled calling seq 1276 if (VerifyOops) { 1277 for (int i = 0; i < method->size_of_parameters(); i++) { 1278 if (is_reference_type(sig_bt[i])) { 1279 VMReg r = regs[i].first(); 1280 assert(r->is_valid(), "bad oop arg"); 1281 if (r->is_stack()) { 1282 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1283 __ verify_oop(temp_reg); 1284 } else { 1285 __ 
verify_oop(r->as_Register()); 1286 } 1287 } 1288 } 1289 } 1290 } 1291 1292 static void check_continuation_enter_argument(VMReg actual_vmreg, 1293 Register expected_reg, 1294 const char* name) { 1295 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1296 assert(actual_vmreg->as_Register() == expected_reg, 1297 "%s is in unexpected register: %s instead of %s", 1298 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1299 } 1300 1301 1302 //---------------------------- continuation_enter_setup --------------------------- 1303 // 1304 // Arguments: 1305 // None. 1306 // 1307 // Results: 1308 // rsp: pointer to blank ContinuationEntry 1309 // 1310 // Kills: 1311 // rax 1312 // 1313 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1314 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1315 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1316 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1317 1318 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1319 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1320 1321 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1322 OopMap* map = new OopMap(frame_size, 0); 1323 1324 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1325 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1326 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1327 1328 return map; 1329 } 1330 1331 //---------------------------- fill_continuation_entry --------------------------- 1332 // 1333 // Arguments: 1334 // rsp: pointer to blank Continuation entry 1335 // reg_cont_obj: pointer to the continuation 1336 // reg_flags: flags 1337 // 1338 // Results: 1339 // rsp: pointer to filled out ContinuationEntry 1340 // 1341 // Kills: 1342 // rax 1343 // 1344 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1345 assert_different_registers(rax, reg_cont_obj, reg_flags); 1346 #ifdef ASSERT 1347 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1348 #endif 1349 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1350 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1351 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1352 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1353 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1354 1355 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1356 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1357 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1358 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1359 1360 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1361 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1362 } 1363 1364 //---------------------------- continuation_enter_cleanup --------------------------- 1365 // 1366 // Arguments: 1367 // rsp: pointer to the ContinuationEntry 1368 // 1369 // Results: 1370 // rsp: pointer to the spilled rbp in the entry frame 1371 // 1372 // Kills: 1373 // rbx 1374 // 1375 static void continuation_enter_cleanup(MacroAssembler* masm) { 1376 #ifdef ASSERT 1377 
Label L_good_sp; 1378 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1379 __ jcc(Assembler::equal, L_good_sp); 1380 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1381 __ bind(L_good_sp); 1382 #endif 1383 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1384 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1385 1386 if (CheckJNICalls) { 1387 // Check if this is a virtual thread continuation 1388 Label L_skip_vthread_code; 1389 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1390 __ jcc(Assembler::equal, L_skip_vthread_code); 1391 1392 // If the held monitor count is > 0 and this vthread is terminating then 1393 // it failed to release a JNI monitor. So we issue the same log message 1394 // that JavaThread::exit does. 1395 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1396 __ jcc(Assembler::equal, L_skip_vthread_code); 1397 1398 // rax may hold an exception oop, save it before the call 1399 __ push(rax); 1400 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1401 __ pop(rax); 1402 1403 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1404 // on termination. The held count is implicitly zeroed below when we restore from 1405 // the parent held count (which has to be zero). 1406 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1407 1408 __ bind(L_skip_vthread_code); 1409 } 1410 #ifdef ASSERT 1411 else { 1412 // Check if this is a virtual thread continuation 1413 Label L_skip_vthread_code; 1414 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1415 __ jcc(Assembler::equal, L_skip_vthread_code); 1416 1417 // See comment just above. If not checking JNI calls the JNI count is only 1418 // needed for assertion checking. 1419 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1420 1421 __ bind(L_skip_vthread_code); 1422 } 1423 #endif 1424 1425 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1426 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1427 1428 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1429 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1430 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1431 } 1432 1433 static void gen_continuation_enter(MacroAssembler* masm, 1434 const VMRegPair* regs, 1435 int& exception_offset, 1436 OopMapSet* oop_maps, 1437 int& frame_complete, 1438 int& stack_slots, 1439 int& interpreted_entry_offset, 1440 int& compiled_entry_offset) { 1441 1442 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1443 int pos_cont_obj = 0; 1444 int pos_is_cont = 1; 1445 int pos_is_virtual = 2; 1446 1447 // The platform-specific calling convention may present the arguments in various registers. 1448 // To simplify the rest of the code, we expect the arguments to reside at these known 1449 // registers, and we additionally check the placement here in case calling convention ever 1450 // changes. 
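// Note: enterSpecial receives its arguments in the Java calling convention,
// which (see java_calling_convention() above) is the C ABI shifted by one
// register, so the first three Java argument registers coincide with
// c_rarg1..c_rarg3. That is why the expected registers below start at c_rarg1
// rather than c_rarg0.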
1451 Register reg_cont_obj = c_rarg1; 1452 Register reg_is_cont = c_rarg2; 1453 Register reg_is_virtual = c_rarg3; 1454 1455 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1456 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1457 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1458 1459 // Utility methods kill rax, make sure there are no collisions 1460 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1461 1462 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1463 relocInfo::static_call_type); 1464 1465 address start = __ pc(); 1466 1467 Label L_thaw, L_exit; 1468 1469 // i2i entry used at interp_only_mode only 1470 interpreted_entry_offset = __ pc() - start; 1471 { 1472 #ifdef ASSERT 1473 Label is_interp_only; 1474 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1475 __ jcc(Assembler::notEqual, is_interp_only); 1476 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1477 __ bind(is_interp_only); 1478 #endif 1479 1480 __ pop(rax); // return address 1481 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1482 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1483 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1484 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1485 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1486 __ push(rax); // return address 1487 __ push_cont_fastpath(); 1488 1489 __ enter(); 1490 1491 stack_slots = 2; // will be adjusted in setup 1492 OopMap* map = continuation_enter_setup(masm, stack_slots); 1493 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1494 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1495 1496 __ verify_oop(reg_cont_obj); 1497 1498 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1499 1500 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1501 __ testptr(reg_is_cont, reg_is_cont); 1502 __ jcc(Assembler::notZero, L_thaw); 1503 1504 // --- Resolve path 1505 1506 // Make sure the call is patchable 1507 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1508 // Emit stub for static call 1509 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1510 if (stub == nullptr) { 1511 fatal("CodeCache is full at gen_continuation_enter"); 1512 } 1513 __ call(resolve); 1514 oop_maps->add_gc_map(__ pc() - start, map); 1515 __ post_call_nop(); 1516 1517 __ jmp(L_exit); 1518 } 1519 1520 // compiled entry 1521 __ align(CodeEntryAlignment); 1522 compiled_entry_offset = __ pc() - start; 1523 __ enter(); 1524 1525 stack_slots = 2; // will be adjusted in setup 1526 OopMap* map = continuation_enter_setup(masm, stack_slots); 1527 1528 // Frame is now completed as far as size and linkage. 1529 frame_complete = __ pc() - start; 1530 1531 __ verify_oop(reg_cont_obj); 1532 1533 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1534 1535 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1536 __ testptr(reg_is_cont, reg_is_cont); 1537 __ jccb(Assembler::notZero, L_thaw); 1538 1539 // --- call Continuation.enter(Continuation c, boolean isContinue) 1540 1541 // Make sure the call is patchable 1542 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1543 1544 // Emit stub for static call 1545 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1546 if (stub == nullptr) { 1547 fatal("CodeCache is full at gen_continuation_enter"); 1548 } 1549 1550 // The call needs to be resolved. There's a special case for this in 1551 // SharedRuntime::find_callee_info_helper() which calls 1552 // LinkResolver::resolve_continuation_enter() which resolves the call to 1553 // Continuation.enter(Continuation c, boolean isContinue). 1554 __ call(resolve); 1555 1556 oop_maps->add_gc_map(__ pc() - start, map); 1557 __ post_call_nop(); 1558 1559 __ jmpb(L_exit); 1560 1561 // --- Thawing path 1562 1563 __ bind(L_thaw); 1564 1565 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1566 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1567 1568 ContinuationEntry::_return_pc_offset = __ pc() - start; 1569 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1570 __ post_call_nop(); 1571 1572 // --- Normal exit (resolve/thawing) 1573 1574 __ bind(L_exit); 1575 ContinuationEntry::_cleanup_offset = __ pc() - start; 1576 continuation_enter_cleanup(masm); 1577 __ pop(rbp); 1578 __ ret(0); 1579 1580 // --- Exception handling path 1581 1582 exception_offset = __ pc() - start; 1583 1584 continuation_enter_cleanup(masm); 1585 __ pop(rbp); 1586 1587 __ movptr(c_rarg0, r15_thread); 1588 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1589 1590 // rax still holds the original exception oop, save it before the call 1591 __ push(rax); 1592 1593 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1594 __ movptr(rbx, rax); 1595 1596 // Continue at exception handler: 1597 // rax: exception oop 1598 // rbx: exception handler 1599 // rdx: exception pc 1600 __ pop(rax); 1601 __ verify_oop(rax); 1602 __ pop(rdx); 1603 __ jmp(rbx); 1604 } 1605 1606 static void gen_continuation_yield(MacroAssembler* masm, 1607 const VMRegPair* regs, 1608 OopMapSet* oop_maps, 1609 int& frame_complete, 1610 int& stack_slots, 1611 int& compiled_entry_offset) { 1612 enum layout { 1613 rbp_off, 1614 rbpH_off, 1615 return_off, 1616 return_off2, 1617 framesize // inclusive of return address 1618 }; 1619 stack_slots = framesize / VMRegImpl::slots_per_word; 1620 assert(stack_slots == 2, "recheck layout"); 1621 1622 address start = __ pc(); 1623 compiled_entry_offset = __ pc() - start; 1624 __ enter(); 1625 address the_pc = __ pc(); 1626 1627 frame_complete = the_pc - start; 1628 1629 // This nop must be exactly at the PC we push into the frame info. 1630 // We use this nop for fast CodeBlob lookup, associate the OopMap 1631 // with it right away. 
1632 __ post_call_nop(); 1633 OopMap* map = new OopMap(framesize, 1); 1634 oop_maps->add_gc_map(frame_complete, map); 1635 1636 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1637 __ movptr(c_rarg0, r15_thread); 1638 __ movptr(c_rarg1, rsp); 1639 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1640 __ reset_last_Java_frame(true); 1641 1642 Label L_pinned; 1643 1644 __ testptr(rax, rax); 1645 __ jcc(Assembler::notZero, L_pinned); 1646 1647 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1648 continuation_enter_cleanup(masm); 1649 __ pop(rbp); 1650 __ ret(0); 1651 1652 __ bind(L_pinned); 1653 1654 // Pinned, return to caller 1655 1656 // handle pending exception thrown by freeze 1657 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1658 Label ok; 1659 __ jcc(Assembler::equal, ok); 1660 __ leave(); 1661 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1662 __ bind(ok); 1663 1664 __ leave(); 1665 __ ret(0); 1666 } 1667 1668 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1669 ::continuation_enter_cleanup(masm); 1670 } 1671 1672 static void gen_special_dispatch(MacroAssembler* masm, 1673 const methodHandle& method, 1674 const BasicType* sig_bt, 1675 const VMRegPair* regs) { 1676 verify_oop_args(masm, method, sig_bt, regs); 1677 vmIntrinsics::ID iid = method->intrinsic_id(); 1678 1679 // Now write the args into the outgoing interpreter space 1680 bool has_receiver = false; 1681 Register receiver_reg = noreg; 1682 int member_arg_pos = -1; 1683 Register member_reg = noreg; 1684 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1685 if (ref_kind != 0) { 1686 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1687 member_reg = rbx; // known to be free at this point 1688 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1689 } else if (iid == vmIntrinsics::_invokeBasic) { 1690 has_receiver = true; 1691 } else if (iid == vmIntrinsics::_linkToNative) { 1692 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1693 member_reg = rbx; // known to be free at this point 1694 } else { 1695 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1696 } 1697 1698 if (member_reg != noreg) { 1699 // Load the member_arg into register, if necessary. 1700 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1701 VMReg r = regs[member_arg_pos].first(); 1702 if (r->is_stack()) { 1703 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1704 } else { 1705 // no data motion is needed 1706 member_reg = r->as_Register(); 1707 } 1708 } 1709 1710 if (has_receiver) { 1711 // Make sure the receiver is loaded into a register. 1712 assert(method->size_of_parameters() > 0, "oob"); 1713 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1714 VMReg r = regs[0].first(); 1715 assert(r->is_valid(), "bad receiver arg"); 1716 if (r->is_stack()) { 1717 // Porting note: This assumes that compiled calling conventions always 1718 // pass the receiver oop in a register. If this is not true on some 1719 // platform, pick a temp and load the receiver from stack. 
1720 fatal("receiver always in a register"); 1721 receiver_reg = j_rarg0; // known to be free at this point 1722 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1723 } else { 1724 // no data motion is needed 1725 receiver_reg = r->as_Register(); 1726 } 1727 } 1728 1729 // Figure out which address we are really jumping to: 1730 MethodHandles::generate_method_handle_dispatch(masm, iid, 1731 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1732 } 1733 1734 // --------------------------------------------------------------------------- 1735 // Generate a native wrapper for a given method. The method takes arguments 1736 // in the Java compiled code convention, marshals them to the native 1737 // convention (handlizes oops, etc), transitions to native, makes the call, 1738 // returns to java state (possibly blocking), unhandlizes any result and 1739 // returns. 1740 // 1741 // Critical native functions are a shorthand for the use of 1742 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1743 // functions. The wrapper is expected to unpack the arguments before 1744 // passing them to the callee. Critical native functions leave the state _in_Java, 1745 // since they cannot stop for GC. 1746 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1747 // block and the check for pending exceptions it's impossible for them 1748 // to be thrown. 1749 // 1750 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1751 const methodHandle& method, 1752 int compile_id, 1753 BasicType* in_sig_bt, 1754 VMRegPair* in_regs, 1755 BasicType ret_type) { 1756 if (method->is_continuation_native_intrinsic()) { 1757 int exception_offset = -1; 1758 OopMapSet* oop_maps = new OopMapSet(); 1759 int frame_complete = -1; 1760 int stack_slots = -1; 1761 int interpreted_entry_offset = -1; 1762 int vep_offset = -1; 1763 if (method->is_continuation_enter_intrinsic()) { 1764 gen_continuation_enter(masm, 1765 in_regs, 1766 exception_offset, 1767 oop_maps, 1768 frame_complete, 1769 stack_slots, 1770 interpreted_entry_offset, 1771 vep_offset); 1772 } else if (method->is_continuation_yield_intrinsic()) { 1773 gen_continuation_yield(masm, 1774 in_regs, 1775 oop_maps, 1776 frame_complete, 1777 stack_slots, 1778 vep_offset); 1779 } else { 1780 guarantee(false, "Unknown Continuation native intrinsic"); 1781 } 1782 1783 #ifdef ASSERT 1784 if (method->is_continuation_enter_intrinsic()) { 1785 assert(interpreted_entry_offset != -1, "Must be set"); 1786 assert(exception_offset != -1, "Must be set"); 1787 } else { 1788 assert(interpreted_entry_offset == -1, "Must be unset"); 1789 assert(exception_offset == -1, "Must be unset"); 1790 } 1791 assert(frame_complete != -1, "Must be set"); 1792 assert(stack_slots != -1, "Must be set"); 1793 assert(vep_offset != -1, "Must be set"); 1794 #endif 1795 1796 __ flush(); 1797 nmethod* nm = nmethod::new_native_nmethod(method, 1798 compile_id, 1799 masm->code(), 1800 vep_offset, 1801 frame_complete, 1802 stack_slots, 1803 in_ByteSize(-1), 1804 in_ByteSize(-1), 1805 oop_maps, 1806 exception_offset); 1807 if (nm == nullptr) return nm; 1808 if (method->is_continuation_enter_intrinsic()) { 1809 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1810 } else if (method->is_continuation_yield_intrinsic()) { 1811 ContinuationEntry::set_yield_code(nm); 1812 } 1813 return nm; 1814 } 1815 1816 if (method->is_method_handle_intrinsic()) { 1817 vmIntrinsics::ID iid = method->intrinsic_id(); 
1818 intptr_t start = (intptr_t)__ pc(); 1819 int vep_offset = ((intptr_t)__ pc()) - start; 1820 gen_special_dispatch(masm, 1821 method, 1822 in_sig_bt, 1823 in_regs); 1824 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1825 __ flush(); 1826 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1827 return nmethod::new_native_nmethod(method, 1828 compile_id, 1829 masm->code(), 1830 vep_offset, 1831 frame_complete, 1832 stack_slots / VMRegImpl::slots_per_word, 1833 in_ByteSize(-1), 1834 in_ByteSize(-1), 1835 nullptr); 1836 } 1837 address native_func = method->native_function(); 1838 assert(native_func != nullptr, "must have function"); 1839 1840 // An OopMap for lock (and class if static) 1841 OopMapSet *oop_maps = new OopMapSet(); 1842 intptr_t start = (intptr_t)__ pc(); 1843 1844 // We have received a description of where all the java arg are located 1845 // on entry to the wrapper. We need to convert these args to where 1846 // the jni function will expect them. To figure out where they go 1847 // we convert the java signature to a C signature by inserting 1848 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1849 1850 const int total_in_args = method->size_of_parameters(); 1851 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1852 1853 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1854 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1855 1856 int argc = 0; 1857 out_sig_bt[argc++] = T_ADDRESS; 1858 if (method->is_static()) { 1859 out_sig_bt[argc++] = T_OBJECT; 1860 } 1861 1862 for (int i = 0; i < total_in_args ; i++ ) { 1863 out_sig_bt[argc++] = in_sig_bt[i]; 1864 } 1865 1866 // Now figure out where the args must be stored and how much stack space 1867 // they require. 1868 int out_arg_slots; 1869 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1870 1871 // Compute framesize for the wrapper. We need to handlize all oops in 1872 // incoming registers 1873 1874 // Calculate the total number of stack slots we will need. 
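  // (Illustrative accounting: the sections below add, in order, the C ABI out-preserve
  // area and outgoing argument slots, the 6-word oop handle area, an optional klass
  // word for static methods, an optional lock word for synchronized methods, and 6
  // misc slots for return-value spills plus the return address and saved rbp, before
  // rounding up to StackAlignmentInSlots.)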
1875 1876 // First count the abi requirement plus all of the outgoing args 1877 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1878 1879 // Now the space for the inbound oop handle area 1880 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1881 1882 int oop_handle_offset = stack_slots; 1883 stack_slots += total_save_slots; 1884 1885 // Now any space we need for handlizing a klass if static method 1886 1887 int klass_slot_offset = 0; 1888 int klass_offset = -1; 1889 int lock_slot_offset = 0; 1890 bool is_static = false; 1891 1892 if (method->is_static()) { 1893 klass_slot_offset = stack_slots; 1894 stack_slots += VMRegImpl::slots_per_word; 1895 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1896 is_static = true; 1897 } 1898 1899 // Plus a lock if needed 1900 1901 if (method->is_synchronized()) { 1902 lock_slot_offset = stack_slots; 1903 stack_slots += VMRegImpl::slots_per_word; 1904 } 1905 1906 // Now a place (+2) to save return values or temp during shuffling 1907 // + 4 for return address (which we own) and saved rbp 1908 stack_slots += 6; 1909 1910 // Ok The space we have allocated will look like: 1911 // 1912 // 1913 // FP-> | | 1914 // |---------------------| 1915 // | 2 slots for moves | 1916 // |---------------------| 1917 // | lock box (if sync) | 1918 // |---------------------| <- lock_slot_offset 1919 // | klass (if static) | 1920 // |---------------------| <- klass_slot_offset 1921 // | oopHandle area | 1922 // |---------------------| <- oop_handle_offset (6 java arg registers) 1923 // | outbound memory | 1924 // | based arguments | 1925 // | | 1926 // |---------------------| 1927 // | | 1928 // SP-> | out_preserved_slots | 1929 // 1930 // 1931 1932 1933 // Now compute actual number of stack words we need rounding to make 1934 // stack properly aligned. 1935 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1936 1937 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1938 1939 // First thing make an ic check to see if we should even be here 1940 1941 // We are free to use all registers as temps without saving them and 1942 // restoring them except rbp. rbp is the only callee save register 1943 // as far as the interpreter and the compiler(s) are concerned. 1944 1945 const Register receiver = j_rarg0; 1946 1947 Label exception_pending; 1948 1949 assert_different_registers(receiver, rscratch1, rscratch2); 1950 __ verify_oop(receiver); 1951 __ ic_check(8 /* end_alignment */); 1952 1953 int vep_offset = ((intptr_t)__ pc()) - start; 1954 1955 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1956 Label L_skip_barrier; 1957 Register klass = r10; 1958 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1959 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 1960 1961 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1962 1963 __ bind(L_skip_barrier); 1964 } 1965 1966 #ifdef COMPILER1 1967 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) ||
      (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */

  // We use r14 as the oop handle for the receiver/klass.
  // It is callee-saved, so it survives the call to native.

  const Register oop_handle_reg = r14;

  // We immediately shuffle the arguments so that for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do.
  // All inbound args are referenced based on rbp and all outbound args via rsp.

#ifdef ASSERT
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for (int r = 0; r < Register::number_of_registers; r++) {
    reg_destroyed[r] = false;
  }
  for (int f = 0; f < XMMRegister::number_of_registers; f++) {
    freg_destroyed[f] = false;
  }
#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
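  // For example, with two incoming Java args on a static method (total_c_args == 4),
  // the loop below records the pairs (1,3) and (0,2): the last Java arg is moved into
  // its C slot first, then the one before it, and so on.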
2050 GrowableArray<int> arg_order(2 * total_in_args); 2051 2052 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2053 arg_order.push(i); 2054 arg_order.push(c_arg); 2055 } 2056 2057 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2058 int i = arg_order.at(ai); 2059 int c_arg = arg_order.at(ai + 1); 2060 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2061 #ifdef ASSERT 2062 if (in_regs[i].first()->is_Register()) { 2063 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2064 } else if (in_regs[i].first()->is_XMMRegister()) { 2065 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2066 } 2067 if (out_regs[c_arg].first()->is_Register()) { 2068 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2069 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2070 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2071 } 2072 #endif /* ASSERT */ 2073 switch (in_sig_bt[i]) { 2074 case T_ARRAY: 2075 case T_OBJECT: 2076 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2077 ((i == 0) && (!is_static)), 2078 &receiver_offset); 2079 break; 2080 case T_VOID: 2081 break; 2082 2083 case T_FLOAT: 2084 __ float_move(in_regs[i], out_regs[c_arg]); 2085 break; 2086 2087 case T_DOUBLE: 2088 assert( i + 1 < total_in_args && 2089 in_sig_bt[i + 1] == T_VOID && 2090 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2091 __ double_move(in_regs[i], out_regs[c_arg]); 2092 break; 2093 2094 case T_LONG : 2095 __ long_move(in_regs[i], out_regs[c_arg]); 2096 break; 2097 2098 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2099 2100 default: 2101 __ move32_64(in_regs[i], out_regs[c_arg]); 2102 } 2103 } 2104 2105 int c_arg; 2106 2107 // Pre-load a static method's oop into r14. Used both by locking code and 2108 // the normal JNI call code. 2109 // point c_arg at the first arg that is already loaded in case we 2110 // need to spill before we call out 2111 c_arg = total_c_args - total_in_args; 2112 2113 if (method->is_static()) { 2114 2115 // load oop into a register 2116 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2117 2118 // Now handlize the static class mirror it's known not-null. 2119 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2120 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2121 2122 // Now get the handle 2123 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2124 // store the klass handle as second argument 2125 __ movptr(c_rarg1, oop_handle_reg); 2126 // and protect the arg if we must spill 2127 c_arg--; 2128 } 2129 2130 // Change state to native (we save the return address in the thread, since it might not 2131 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2132 // points into the right code segment. It does not have to be the correct return pc. 2133 // We use the same pc/oopMap repeatedly when we call out 2134 2135 Label native_return; 2136 if (method->is_object_wait0()) { 2137 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2138 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2139 } else { 2140 intptr_t the_pc = (intptr_t) __ pc(); 2141 oop_maps->add_gc_map(the_pc - start, map); 2142 2143 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2144 } 2145 2146 // We have all of the arguments setup at this point. 
  // We must not touch any of the argument registers at this point (if we had to
  // save and restore them around a call there would be no oop map covering them).

  if (DTraceMethodProbes) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)

  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box
    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally just about ready to make the JNI call

  // get JNIEnv*, which is the first argument to the native function
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);       break;
  case T_BYTE   : __ sign_extend_byte (rax); break;
  case T_SHORT  : __ sign_extend_short(rax); break;
  case T_INT    : /* nothing to do */        break;
  case T_DOUBLE :
  case T_FLOAT  :
    // Result is in xmm0, we'll save it as needed
    break;
  case T_ARRAY:                 // Really a handle
  case T_OBJECT:                // Really a handle
    break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default       : ShouldNotReachHere();
  }

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  //     VM thread changes sync state to synchronizing and suspends threads for GC.
  //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
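  // The state store below must be ordered before the safepoint poll read; that is done
  // either by the explicit fence that follows or, when UseSystemMemoryBarrier is set,
  // by a process-wide barrier issued from the VM side.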
2237 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2238 2239 // Force this write out before the read below 2240 if (!UseSystemMemoryBarrier) { 2241 __ membar(Assembler::Membar_mask_bits( 2242 Assembler::LoadLoad | Assembler::LoadStore | 2243 Assembler::StoreLoad | Assembler::StoreStore)); 2244 } 2245 2246 // check for safepoint operation in progress and/or pending suspend requests 2247 { 2248 Label Continue; 2249 Label slow_path; 2250 2251 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */); 2252 2253 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2254 __ jcc(Assembler::equal, Continue); 2255 __ bind(slow_path); 2256 2257 // Don't use call_VM as it will see a possible pending exception and forward it 2258 // and never return here preventing us from clearing _last_native_pc down below. 2259 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2260 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2261 // by hand. 2262 // 2263 __ vzeroupper(); 2264 save_native_result(masm, ret_type, stack_slots); 2265 __ mov(c_rarg0, r15_thread); 2266 __ mov(r12, rsp); // remember sp 2267 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2268 __ andptr(rsp, -16); // align stack as required by ABI 2269 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2270 __ mov(rsp, r12); // restore sp 2271 __ reinit_heapbase(); 2272 // Restore any method result value 2273 restore_native_result(masm, ret_type, stack_slots); 2274 __ bind(Continue); 2275 } 2276 2277 // change thread state 2278 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2279 2280 if (method->is_object_wait0()) { 2281 // Check preemption for Object.wait() 2282 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2283 __ cmpptr(rscratch1, NULL_WORD); 2284 __ jccb(Assembler::equal, native_return); 2285 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2286 __ jmp(rscratch1); 2287 __ bind(native_return); 2288 2289 intptr_t the_pc = (intptr_t) __ pc(); 2290 oop_maps->add_gc_map(the_pc - start, map); 2291 } 2292 2293 2294 Label reguard; 2295 Label reguard_done; 2296 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2297 __ jcc(Assembler::equal, reguard); 2298 __ bind(reguard_done); 2299 2300 // native result if any is live 2301 2302 // Unlock 2303 Label slow_path_unlock; 2304 Label unlock_done; 2305 if (method->is_synchronized()) { 2306 2307 Label fast_done; 2308 2309 // Get locked oop from the handle we passed to jni 2310 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2311 2312 // Must save rax if it is live now because cmpxchg must use it 2313 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2314 save_native_result(masm, ret_type, stack_slots); 2315 } 2316 2317 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2318 2319 // slow path re-enters here 2320 __ bind(unlock_done); 2321 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2322 restore_native_result(masm, ret_type, stack_slots); 2323 } 2324 2325 __ bind(fast_done); 2326 } 2327 if (DTraceMethodProbes) { 2328 save_native_result(masm, ret_type, stack_slots); 2329 __ mov_metadata(c_rarg1, method()); 2330 __ call_VM_leaf( 2331 CAST_FROM_FN_PTR(address, 
SharedRuntime::dtrace_method_exit), 2332 r15_thread, c_rarg1); 2333 restore_native_result(masm, ret_type, stack_slots); 2334 } 2335 2336 __ reset_last_Java_frame(false); 2337 2338 // Unbox oop result, e.g. JNIHandles::resolve value. 2339 if (is_reference_type(ret_type)) { 2340 __ resolve_jobject(rax /* value */, 2341 rcx /* tmp */); 2342 } 2343 2344 if (CheckJNICalls) { 2345 // clear_pending_jni_exception_check 2346 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2347 } 2348 2349 // reset handle block 2350 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2351 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2352 2353 // pop our frame 2354 2355 __ leave(); 2356 2357 #if INCLUDE_JFR 2358 // We need to do a poll test after unwind in case the sampler 2359 // managed to sample the native frame after returning to Java. 2360 Label L_return; 2361 address poll_test_pc = __ pc(); 2362 __ relocate(relocInfo::poll_return_type); 2363 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 2364 __ jccb(Assembler::zero, L_return); 2365 __ lea(rscratch1, InternalAddress(poll_test_pc)); 2366 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1); 2367 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr, 2368 "polling page return stub not created yet"); 2369 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); 2370 __ jump(RuntimeAddress(stub)); 2371 __ bind(L_return); 2372 #endif // INCLUDE_JFR 2373 2374 // Any exception pending? 2375 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2376 __ jcc(Assembler::notEqual, exception_pending); 2377 2378 // Return 2379 2380 __ ret(0); 2381 2382 // Unexpected paths are out of line and go here 2383 2384 // forward the exception 2385 __ bind(exception_pending); 2386 2387 // and forward the exception 2388 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2389 2390 // Slow path locking & unlocking 2391 if (method->is_synchronized()) { 2392 2393 // BEGIN Slow path lock 2394 __ bind(slow_path_lock); 2395 2396 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2397 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2398 2399 // protect the args we've loaded 2400 save_args(masm, total_c_args, c_arg, out_regs); 2401 2402 __ mov(c_rarg0, obj_reg); 2403 __ mov(c_rarg1, lock_reg); 2404 __ mov(c_rarg2, r15_thread); 2405 2406 // Not a leaf but we have last_Java_frame setup as we want. 2407 // We don't want to unmount in case of contention since that would complicate preserving 2408 // the arguments that had already been marshalled into the native convention. So we force 2409 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2410 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 
2411 __ push_cont_fastpath(); 2412 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2413 __ pop_cont_fastpath(); 2414 restore_args(masm, total_c_args, c_arg, out_regs); 2415 2416 #ifdef ASSERT 2417 { Label L; 2418 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2419 __ jcc(Assembler::equal, L); 2420 __ stop("no pending exception allowed on exit from monitorenter"); 2421 __ bind(L); 2422 } 2423 #endif 2424 __ jmp(lock_done); 2425 2426 // END Slow path lock 2427 2428 // BEGIN Slow path unlock 2429 __ bind(slow_path_unlock); 2430 2431 // If we haven't already saved the native result we must save it now as xmm registers 2432 // are still exposed. 2433 __ vzeroupper(); 2434 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2435 save_native_result(masm, ret_type, stack_slots); 2436 } 2437 2438 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2439 2440 __ mov(c_rarg0, obj_reg); 2441 __ mov(c_rarg2, r15_thread); 2442 __ mov(r12, rsp); // remember sp 2443 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2444 __ andptr(rsp, -16); // align stack as required by ABI 2445 2446 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2447 // NOTE that obj_reg == rbx currently 2448 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2449 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2450 2451 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2452 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2453 __ mov(rsp, r12); // restore sp 2454 __ reinit_heapbase(); 2455 #ifdef ASSERT 2456 { 2457 Label L; 2458 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2459 __ jcc(Assembler::equal, L); 2460 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2461 __ bind(L); 2462 } 2463 #endif /* ASSERT */ 2464 2465 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2466 2467 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2468 restore_native_result(masm, ret_type, stack_slots); 2469 } 2470 __ jmp(unlock_done); 2471 2472 // END Slow path unlock 2473 2474 } // synchronized 2475 2476 // SLOW PATH Reguard the stack if needed 2477 2478 __ bind(reguard); 2479 __ vzeroupper(); 2480 save_native_result(masm, ret_type, stack_slots); 2481 __ mov(r12, rsp); // remember sp 2482 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2483 __ andptr(rsp, -16); // align stack as required by ABI 2484 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2485 __ mov(rsp, r12); // restore sp 2486 __ reinit_heapbase(); 2487 restore_native_result(masm, ret_type, stack_slots); 2488 // and continue 2489 __ jmp(reguard_done); 2490 2491 2492 2493 __ flush(); 2494 2495 nmethod *nm = nmethod::new_native_nmethod(method, 2496 compile_id, 2497 masm->code(), 2498 vep_offset, 2499 frame_complete, 2500 stack_slots / VMRegImpl::slots_per_word, 2501 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2502 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2503 oop_maps); 2504 2505 return nm; 2506 } 2507 2508 // this function returns the adjust size (in number of words) to a c2i adapter 2509 // activation for use during deoptimization 2510 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2511 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2512 } 2513 2514 2515 uint SharedRuntime::out_preserve_stack_slots() { 2516 return 0; 2517 } 2518 2519 2520 // Number of stack slots between incoming argument block and the start of 2521 // a new frame. The PROLOG must add this many slots to the stack. The 2522 // EPILOG must remove this many slots. amd64 needs two slots for 2523 // return address. 2524 uint SharedRuntime::in_preserve_stack_slots() { 2525 return 4 + 2 * VerifyStackAtCalls; 2526 } 2527 2528 VMReg SharedRuntime::thread_register() { 2529 return r15_thread->as_VMReg(); 2530 } 2531 2532 //------------------------------generate_deopt_blob---------------------------- 2533 void SharedRuntime::generate_deopt_blob() { 2534 // Allocate space for the code 2535 ResourceMark rm; 2536 // Setup code generation tools 2537 int pad = 0; 2538 if (UseAVX > 2) { 2539 pad += 1024; 2540 } 2541 if (UseAPX) { 2542 pad += 1024; 2543 } 2544 #if INCLUDE_JVMCI 2545 if (EnableJVMCI) { 2546 pad += 512; // Increase the buffer size when compiling for JVMCI 2547 } 2548 #endif 2549 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id); 2550 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id); 2551 if (blob != nullptr) { 2552 _deopt_blob = blob->as_deoptimization_blob(); 2553 return; 2554 } 2555 2556 CodeBuffer buffer(name, 2560+pad, 1024); 2557 MacroAssembler* masm = new MacroAssembler(&buffer); 2558 int frame_size_in_words; 2559 OopMap* map = nullptr; 2560 OopMapSet *oop_maps = new OopMapSet(); 2561 2562 // ------------- 2563 // This code enters when returning to a de-optimized nmethod. A return 2564 // address has been pushed on the stack, and return values are in 2565 // registers. 2566 // If we are doing a normal deopt then we were called from the patched 2567 // nmethod from the point we returned to the nmethod. So the return 2568 // address on the stack is wrong by NativeCall::instruction_size 2569 // We will adjust the value so it looks like we have the original return 2570 // address on the stack (like when we eagerly deoptimized). 2571 // In the case of an exception pending when deoptimizing, we enter 2572 // with a return address on the stack that points after the call we patched 2573 // into the exception handler. We have the following register state from, 2574 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2575 // rax: exception oop 2576 // rbx: exception handler 2577 // rdx: throwing pc 2578 // So in this case we simply jam rdx into the useless return address and 2579 // the stack looks just like we want. 2580 // 2581 // At this point we need to de-opt. We save the argument return 2582 // registers. We call the first C routine, fetch_unroll_info(). This 2583 // routine captures the return values and returns a structure which 2584 // describes the current frame size and the sizes of all replacement frames. 2585 // The current frame is compiled code and may contain many inlined 2586 // functions, each with their own JVM state. We pop the current frame, then 2587 // push all the new frames. 
  // Then we call the C routine unpack_frames() to populate these frames. Finally
  // unpack_frames() returns us the new target address. Notice that callee-save
  // registers are BLOWN here; they have already been captured in the vframeArray
  // at the time the return PC was patched.
  address start = __ pc();
  Label cont;

  // Prolog for the non-exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes what bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce an identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map(__ pc() - start, map->deep_copy());

    __ reset_last_Java_frame(false);

    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // All registers are dead at this entry point, except for rax and rdx,
  // which contain the exception oop and exception pc respectively.
  // Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.
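  // Stash the exception oop and pc in the thread so this path can share the
  // unpack_with_exception_in_tls code that follows.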
2662 2663 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2664 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2665 2666 int exception_in_tls_offset = __ pc() - start; 2667 2668 // new implementation because exception oop is now passed in JavaThread 2669 2670 // Prolog for exception case 2671 // All registers must be preserved because they might be used by LinearScan 2672 // Exceptiop oop and throwing PC are passed in JavaThread 2673 // tos: stack at point of call to method that threw the exception (i.e. only 2674 // args are on the stack, no return address) 2675 2676 // make room on stack for the return address 2677 // It will be patched later with the throwing pc. The correct value is not 2678 // available now because loading it from memory would destroy registers. 2679 __ push(0); 2680 2681 // Save everything in sight. 2682 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2683 2684 // Now it is safe to overwrite any register 2685 2686 // Deopt during an exception. Save exec mode for unpack_frames. 2687 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2688 2689 // load throwing pc from JavaThread and patch it as the return address 2690 // of the current frame. Then clear the field in JavaThread 2691 2692 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2693 __ movptr(Address(rbp, wordSize), rdx); 2694 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2695 2696 #ifdef ASSERT 2697 // verify that there is really an exception oop in JavaThread 2698 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2699 __ verify_oop(rax); 2700 2701 // verify that there is no pending exception 2702 Label no_pending_exception; 2703 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2704 __ testptr(rax, rax); 2705 __ jcc(Assembler::zero, no_pending_exception); 2706 __ stop("must not have pending exception here"); 2707 __ bind(no_pending_exception); 2708 #endif 2709 2710 __ bind(cont); 2711 2712 // Call C code. Need thread and this frame, but NOT official VM entry 2713 // crud. We cannot block on this call, no GC can happen. 2714 // 2715 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2716 2717 // fetch_unroll_info needs to call last_java_frame(). 2718 2719 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2720 #ifdef ASSERT 2721 { Label L; 2722 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2723 __ jcc(Assembler::equal, L); 2724 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2725 __ bind(L); 2726 } 2727 #endif // ASSERT 2728 __ mov(c_rarg0, r15_thread); 2729 __ movl(c_rarg1, r14); // exec_mode 2730 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2731 2732 // Need to have an oopmap that tells fetch_unroll_info where to 2733 // find any register it might need. 2734 oop_maps->add_gc_map(__ pc() - start, map); 2735 2736 __ reset_last_Java_frame(false); 2737 2738 #if INCLUDE_JVMCI 2739 if (EnableJVMCI) { 2740 __ bind(after_fetch_unroll_info_call); 2741 } 2742 #endif 2743 2744 // Load UnrollBlock* into rdi 2745 __ mov(rdi, rax); 2746 2747 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2748 Label noException; 2749 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless, it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers. Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done, the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save.
  // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
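  // sender_sp (r8) carries that original, unextended sp from one pushed skeletal
  // interpreter frame to the next in the loop below.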
2819 2820 const Register sender_sp = r8; 2821 2822 __ mov(sender_sp, rsp); 2823 __ movl(rbx, Address(rdi, 2824 Deoptimization::UnrollBlock:: 2825 caller_adjustment_offset())); 2826 __ subptr(rsp, rbx); 2827 2828 // Push interpreter frames in a loop 2829 Label loop; 2830 __ bind(loop); 2831 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2832 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2833 __ pushptr(Address(rcx, 0)); // Save return address 2834 __ enter(); // Save old & set new ebp 2835 __ subptr(rsp, rbx); // Prolog 2836 // This value is corrected by layout_activation_impl 2837 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2838 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2839 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2840 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2841 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2842 __ decrementl(rdx); // Decrement counter 2843 __ jcc(Assembler::notZero, loop); 2844 __ pushptr(Address(rcx, 0)); // Save final return address 2845 2846 // Re-push self-frame 2847 __ enter(); // Save old & set new ebp 2848 2849 // Allocate a full sized register save area. 2850 // Return address and rbp are in place, so we allocate two less words. 2851 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2852 2853 // Restore frame locals after moving the frame 2854 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2855 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2856 2857 // Call C code. Need thread but NOT official VM entry 2858 // crud. We cannot block on this call, no GC can happen. Call should 2859 // restore return values to their stack-slots with the new SP. 2860 // 2861 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2862 2863 // Use rbp because the frames look interpreted now 2864 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2865 // Don't need the precise return PC here, just precise enough to point into this code blob. 2866 address the_pc = __ pc(); 2867 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2868 2869 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2870 __ mov(c_rarg0, r15_thread); 2871 __ movl(c_rarg1, r14); // second arg: exec_mode 2872 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2873 // Revert SP alignment after call since we're going to do some SP relative addressing below 2874 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2875 2876 // Set an oopmap for the call site 2877 // Use the same PC we used for the last java frame 2878 oop_maps->add_gc_map(the_pc - start, 2879 new OopMap( frame_size_in_words, 0 )); 2880 2881 // Clear fp AND pc 2882 __ reset_last_Java_frame(true); 2883 2884 // Collect return values 2885 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2886 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2887 // I think this is useless (throwing pc?) 2888 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2889 2890 // Pop self-frame. 
2891 __ leave(); // Epilog 2892 2893 // Jump to interpreter 2894 __ ret(0); 2895 2896 // Make sure all code is generated 2897 masm->flush(); 2898 2899 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2900 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2901 #if INCLUDE_JVMCI 2902 if (EnableJVMCI) { 2903 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2904 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2905 } 2906 #endif 2907 2908 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id); 2909 } 2910 2911 //------------------------------generate_handler_blob------ 2912 // 2913 // Generate a special Compile2Runtime blob that saves all registers, 2914 // and setup oopmap. 2915 // 2916 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) { 2917 assert(StubRoutines::forward_exception_entry() != nullptr, 2918 "must be generated before"); 2919 assert(is_polling_page_id(id), "expected a polling page stub id"); 2920 2921 // Allocate space for the code. Setup code generation tools. 2922 const char* name = SharedRuntime::stub_name(id); 2923 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 2924 if (blob != nullptr) { 2925 return blob->as_safepoint_blob(); 2926 } 2927 2928 ResourceMark rm; 2929 OopMapSet *oop_maps = new OopMapSet(); 2930 OopMap* map; 2931 CodeBuffer buffer(name, 2548, 1024); 2932 MacroAssembler* masm = new MacroAssembler(&buffer); 2933 2934 address start = __ pc(); 2935 address call_pc = nullptr; 2936 int frame_size_in_words; 2937 bool cause_return = (id == StubId::shared_polling_page_return_handler_id); 2938 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id); 2939 2940 // Make room for return address (or push it again) 2941 if (!cause_return) { 2942 __ push(rbx); 2943 } 2944 2945 // Save registers, fpu state, and flags 2946 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 2947 2948 // The following is basically a call_VM. However, we need the precise 2949 // address of the call in order to generate an oopmap. Hence, we do all the 2950 // work ourselves. 2951 2952 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 2953 2954 // The return address must always be correct so that frame constructor never 2955 // sees an invalid pc. 2956 2957 if (!cause_return) { 2958 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 2959 // Additionally, rbx is a callee saved register and we can look at it later to determine 2960 // if someone changed the return address for us! 2961 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 2962 __ movptr(Address(rbp, wordSize), rbx); 2963 } 2964 2965 // Do the call 2966 __ mov(c_rarg0, r15_thread); 2967 __ call(RuntimeAddress(call_ptr)); 2968 2969 // Set an oopmap for the call site. This oopmap will map all 2970 // oop-registers and debug-info registers as callee-saved. This 2971 // will allow deoptimization at this safepoint to find all possible 2972 // debug-info recordings, as well as let GC find all oops. 
2973 2974 oop_maps->add_gc_map( __ pc() - start, map); 2975 2976 Label noException; 2977 2978 __ reset_last_Java_frame(false); 2979 2980 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 2981 __ jcc(Assembler::equal, noException); 2982 2983 // Exception pending 2984 2985 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 2986 2987 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2988 2989 // No exception case 2990 __ bind(noException); 2991 2992 Label no_adjust; 2993 #ifdef ASSERT 2994 Label bail; 2995 #endif 2996 if (!cause_return) { 2997 Label no_prefix, not_special, check_rex_prefix; 2998 2999 // If our stashed return pc was modified by the runtime we avoid touching it 3000 __ cmpptr(rbx, Address(rbp, wordSize)); 3001 __ jcc(Assembler::notEqual, no_adjust); 3002 3003 // Skip over the poll instruction. 3004 // See NativeInstruction::is_safepoint_poll() 3005 // Possible encodings: 3006 // 85 00 test %eax,(%rax) 3007 // 85 01 test %eax,(%rcx) 3008 // 85 02 test %eax,(%rdx) 3009 // 85 03 test %eax,(%rbx) 3010 // 85 06 test %eax,(%rsi) 3011 // 85 07 test %eax,(%rdi) 3012 // 3013 // 41 85 00 test %eax,(%r8) 3014 // 41 85 01 test %eax,(%r9) 3015 // 41 85 02 test %eax,(%r10) 3016 // 41 85 03 test %eax,(%r11) 3017 // 41 85 06 test %eax,(%r14) 3018 // 41 85 07 test %eax,(%r15) 3019 // 3020 // 85 04 24 test %eax,(%rsp) 3021 // 41 85 04 24 test %eax,(%r12) 3022 // 85 45 00 test %eax,0x0(%rbp) 3023 // 41 85 45 00 test %eax,0x0(%r13) 3024 // 3025 // Notes: 3026 // Format of legacy MAP0 test instruction:- 3027 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32] 3028 // o For safepoint polling instruction "test %eax,(%rax)", encoding of first register 3029 // operand and base register of memory operand is b/w [0-8), hence we do not require 3030 // additional REX prefix where REX.B bit stores MSB bit of register encoding, which 3031 // is why two bytes encoding is sufficient here. 3032 // o For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE 3033 // register of memory operand is 1000, thus we need additional REX prefix in this case, 3034 // there by adding additional byte to instruction encoding. 3035 // o In case BASE register is one of the 32 extended GPR registers available only on targets 3036 // supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold 3037 // most significant two bits of 5 bit register encoding. 3038 3039 if (VM_Version::supports_apx_f()) { 3040 __ cmpb(Address(rbx, 0), Assembler::REX2); 3041 __ jccb(Assembler::notEqual, check_rex_prefix); 3042 __ addptr(rbx, 2); 3043 __ bind(check_rex_prefix); 3044 } 3045 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3046 __ jccb(Assembler::notEqual, no_prefix); 3047 __ addptr(rbx, 1); 3048 __ bind(no_prefix); 3049 #ifdef ASSERT 3050 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3051 #endif 3052 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3053 // r12/rsp 0x04 3054 // r13/rbp 0x05 3055 __ movzbq(rcx, Address(rbx, 1)); 3056 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3057 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3058 __ cmpptr(rcx, 1); 3059 __ jccb(Assembler::above, not_special); 3060 __ addptr(rbx, 1); 3061 __ bind(not_special); 3062 #ifdef ASSERT 3063 // Verify the correct encoding of the poll we're about to skip. 
3064 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3065 __ jcc(Assembler::notEqual, bail); 3066 // Mask out the modrm bits 3067 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3068 // rax encodes to 0, so if the bits are nonzero it's incorrect 3069 __ jcc(Assembler::notZero, bail); 3070 #endif 3071 // Adjust return pc forward to step over the safepoint poll instruction 3072 __ addptr(rbx, 2); 3073 __ movptr(Address(rbp, wordSize), rbx); 3074 } 3075 3076 __ bind(no_adjust); 3077 // Normal exit: restore registers and return. 3078 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3079 __ ret(0); 3080 3081 #ifdef ASSERT 3082 __ bind(bail); 3083 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3084 #endif 3085 3086 // Make sure all code is generated 3087 masm->flush(); 3088 3089 // Fill out other meta info 3090 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3091 3092 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3093 return sp_blob; 3094 } 3095 3096 // 3097 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss) 3098 // 3099 // Generate a stub that calls into the VM to find out the proper destination 3100 // of a Java call. All the argument registers are live at this point, 3101 // but since this is generic code we don't know what they are and the caller 3102 // must do any GC of the args. 3103 // 3104 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) { 3105 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before"); 3106 assert(is_resolve_id(id), "expected a resolve stub id"); 3107 3108 const char* name = SharedRuntime::stub_name(id); 3109 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3110 if (blob != nullptr) { 3111 return blob->as_runtime_stub(); 3112 } 3113 3114 // Allocate space for the code 3115 ResourceMark rm; 3116 CodeBuffer buffer(name, 1552, 512); 3117 MacroAssembler* masm = new MacroAssembler(&buffer); 3118 3119 int frame_size_in_words; 3120 3121 OopMapSet *oop_maps = new OopMapSet(); 3122 OopMap* map = nullptr; 3123 3124 int start = __ offset(); 3125 3126 // No need to save vector registers since they are caller-saved anyway. 3127 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false); 3128 3129 int frame_complete = __ offset(); 3130 3131 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3132 3133 __ mov(c_rarg0, r15_thread); 3134 3135 __ call(RuntimeAddress(destination)); 3136 3137 3138 // Set an oopmap for the call site. 3139 // We need this not only for callee-saved registers, but also for volatile 3140 // registers that the compiler might be keeping live across a safepoint.
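// On the normal return path below, the Method* and the target entry point returned by the VM are written
// back into the register save area, so restore_live_registers() reloads them into rbx and rax before we
// jump to the resolved destination.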
3141 3142 oop_maps->add_gc_map( __ offset() - start, map); 3143 3144 // rax contains the address we are going to jump to assuming no exception got installed 3145 3146 // clear last_Java_sp 3147 __ reset_last_Java_frame(false); 3148 // check for pending exceptions 3149 Label pending; 3150 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3151 __ jcc(Assembler::notEqual, pending); 3152 3153 // get the returned Method* 3154 __ get_vm_result_metadata(rbx); 3155 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx); 3156 3157 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3158 3159 RegisterSaver::restore_live_registers(masm); 3160 3161 // We are back to the original state on entry and ready to go. 3162 3163 __ jmp(rax); 3164 3165 // Pending exception after the safepoint 3166 3167 __ bind(pending); 3168 3169 RegisterSaver::restore_live_registers(masm); 3170 3171 // exception pending => remove activation and forward to exception handler 3172 3173 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD); 3174 3175 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3176 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3177 3178 // ------------- 3179 // make sure all code is generated 3180 masm->flush(); 3181 3182 // return the blob 3183 // frame_size_words or bytes?? 3184 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3185 3186 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3187 return rs_blob; 3188 } 3189 3190 // Continuation point for throwing of implicit exceptions that are 3191 // not handled in the current activation. Fabricates an exception 3192 // oop and initiates normal exception dispatching in this 3193 // frame. Since we need to preserve callee-saved values (currently 3194 // only for C2, but done for C1 as well) we need a callee-saved oop 3195 // map and therefore have to make these stubs into RuntimeStubs 3196 // rather than BufferBlobs. If the compiler needs all registers to 3197 // be preserved between the fault point and the exception handler 3198 // then it must assume responsibility for that in 3199 // AbstractCompiler::continuation_for_implicit_null_exception or 3200 // continuation_for_implicit_division_by_zero_exception. All other 3201 // implicit exceptions (e.g., NullPointerException or 3202 // AbstractMethodError on entry) are either at call sites or 3203 // otherwise assume that stack unwinding will be initiated, so 3204 // caller saved registers were assumed volatile in the compiler. 3205 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) { 3206 assert(is_throw_id(id), "expected a throw stub id"); 3207 3208 const char* name = SharedRuntime::stub_name(id); 3209 3210 // Information about frame layout at time of blocking runtime call. 3211 // Note that we only have to preserve callee-saved registers since 3212 // the compilers are responsible for supplying a continuation point 3213 // if they expect all registers to be preserved. 
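// The layout below is in jint-sized stack slots (BytesPerInt each); framesize is later converted to words
// via framesize >> (LogBytesPerWord - LogBytesPerInt) when the RuntimeStub is created. (With no argument
// register save area that is 4 slots, i.e. 2 words: saved rbp plus the return address.)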
3214 enum layout { 3215 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3216 rbp_off2, 3217 return_off, 3218 return_off2, 3219 framesize // inclusive of return address 3220 }; 3221 3222 int insts_size = 512; 3223 int locs_size = 64; 3224 3225 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3226 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3227 3228 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3229 if (blob != nullptr) { 3230 return blob->as_runtime_stub(); 3231 } 3232 3233 ResourceMark rm; 3234 CodeBuffer code(name, insts_size, locs_size); 3235 OopMapSet* oop_maps = new OopMapSet(); 3236 MacroAssembler* masm = new MacroAssembler(&code); 3237 3238 address start = __ pc(); 3239 3240 // This is an inlined and slightly modified version of call_VM 3241 // which has the ability to fetch the return PC out of 3242 // thread-local storage and also sets up last_Java_sp slightly 3243 // differently than the real call_VM 3244 3245 __ enter(); // required for proper stackwalking of RuntimeStub frame 3246 3247 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3248 3249 // return address and rbp are already in place 3250 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3251 3252 int frame_complete = __ pc() - start; 3253 3254 // Set up last_Java_sp and last_Java_fp 3255 address the_pc = __ pc(); 3256 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3257 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3258 3259 // Call runtime 3260 __ movptr(c_rarg0, r15_thread); 3261 BLOCK_COMMENT("call runtime_entry"); 3262 __ call(RuntimeAddress(runtime_entry)); 3263 3264 // Generate oop map 3265 OopMap* map = new OopMap(framesize, 0); 3266 3267 oop_maps->add_gc_map(the_pc - start, map); 3268 3269 __ reset_last_Java_frame(true); 3270 3271 __ leave(); // required for proper stackwalking of RuntimeStub frame 3272 3273 // check for pending exceptions 3274 #ifdef ASSERT 3275 Label L; 3276 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3277 __ jcc(Assembler::notEqual, L); 3278 __ should_not_reach_here(); 3279 __ bind(L); 3280 #endif // ASSERT 3281 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3282 3283 3284 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3285 RuntimeStub* stub = 3286 RuntimeStub::new_runtime_stub(name, 3287 &code, 3288 frame_complete, 3289 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3290 oop_maps, false); 3291 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3292 3293 return stub; 3294 } 3295 3296 //------------------------------Montgomery multiplication------------------------ 3297 // 3298 3299 #ifndef _WINDOWS 3300 3301 // Subtract 0:b from carry:a. Return carry. 3302 static julong 3303 sub(julong a[], julong b[], julong carry, long len) { 3304 long long i = 0, cnt = len; 3305 julong tmp; 3306 asm volatile("clc; " 3307 "0: ; " 3308 "mov (%[b], %[i], 8), %[tmp]; " 3309 "sbb %[tmp], (%[a], %[i], 8); " 3310 "inc %[i]; dec %[cnt]; " 3311 "jne 0b; " 3312 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3313 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3314 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3315 : "memory"); 3316 return tmp; 3317 } 3318 3319 // Multiply (unsigned) Long A by Long B, accumulating the double- 3320 // length result into the accumulator formed of T0, T1, and T2. 
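// A rough portable sketch of what MACC computes (illustration only; the actual macros below use inline asm
// or compiler intrinsics), assuming unsigned __int128 support:
//   unsigned __int128 p = (unsigned __int128)A * B;
//   julong lo = (julong)p, hi = (julong)(p >> 64);
//   T0 += lo; if (T0 < lo) hi++;   // carry out of T0 (hi cannot wrap here)
//   T1 += hi; if (T1 < hi) T2++;   // carry out of T1 into T2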
3321 #define MACC(A, B, T0, T1, T2) \ 3322 do { \ 3323 unsigned long hi, lo; \ 3324 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3325 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3326 : "r"(A), "a"(B) : "cc"); \ 3327 } while(0) 3328 3329 // As above, but add twice the double-length result into the 3330 // accumulator. 3331 #define MACC2(A, B, T0, T1, T2) \ 3332 do { \ 3333 unsigned long hi, lo; \ 3334 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3335 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3336 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3337 : "r"(A), "a"(B) : "cc"); \ 3338 } while(0) 3339 3340 #else //_WINDOWS 3341 3342 static julong 3343 sub(julong a[], julong b[], julong carry, long len) { 3344 long i; 3345 julong tmp; 3346 unsigned char c = 1; 3347 for (i = 0; i < len; i++) { 3348 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3349 a[i] = tmp; 3350 } 3351 c = _addcarry_u64(c, carry, ~0, &tmp); 3352 return tmp; 3353 } 3354 3355 // Multiply (unsigned) Long A by Long B, accumulating the double- 3356 // length result into the accumulator formed of T0, T1, and T2. 3357 #define MACC(A, B, T0, T1, T2) \ 3358 do { \ 3359 julong hi, lo; \ 3360 lo = _umul128(A, B, &hi); \ 3361 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3362 c = _addcarry_u64(c, hi, T1, &T1); \ 3363 _addcarry_u64(c, T2, 0, &T2); \ 3364 } while(0) 3365 3366 // As above, but add twice the double-length result into the 3367 // accumulator. 3368 #define MACC2(A, B, T0, T1, T2) \ 3369 do { \ 3370 julong hi, lo; \ 3371 lo = _umul128(A, B, &hi); \ 3372 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3373 c = _addcarry_u64(c, hi, T1, &T1); \ 3374 _addcarry_u64(c, T2, 0, &T2); \ 3375 c = _addcarry_u64(0, lo, T0, &T0); \ 3376 c = _addcarry_u64(c, hi, T1, &T1); \ 3377 _addcarry_u64(c, T2, 0, &T2); \ 3378 } while(0) 3379 3380 #endif //_WINDOWS 3381 3382 // Fast Montgomery multiplication. The derivation of the algorithm is 3383 // in A Cryptographic Library for the Motorola DSP56000, 3384 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3385 3386 static void NOINLINE 3387 montgomery_multiply(julong a[], julong b[], julong n[], 3388 julong m[], julong inv, int len) { 3389 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3390 int i; 3391 3392 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3393 3394 for (i = 0; i < len; i++) { 3395 int j; 3396 for (j = 0; j < i; j++) { 3397 MACC(a[j], b[i-j], t0, t1, t2); 3398 MACC(m[j], n[i-j], t0, t1, t2); 3399 } 3400 MACC(a[i], b[0], t0, t1, t2); 3401 m[i] = t0 * inv; 3402 MACC(m[i], n[0], t0, t1, t2); 3403 3404 assert(t0 == 0, "broken Montgomery multiply"); 3405 3406 t0 = t1; t1 = t2; t2 = 0; 3407 } 3408 3409 for (i = len; i < 2*len; i++) { 3410 int j; 3411 for (j = i-len+1; j < len; j++) { 3412 MACC(a[j], b[i-j], t0, t1, t2); 3413 MACC(m[j], n[i-j], t0, t1, t2); 3414 } 3415 m[i-len] = t0; 3416 t0 = t1; t1 = t2; t2 = 0; 3417 } 3418 3419 while (t0) 3420 t0 = sub(m, n, t0, len); 3421 } 3422 3423 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3424 // multiplies so it should be up to 25% faster than Montgomery 3425 // multiplication. However, its loop control is more complex and it 3426 // may actually run slower on some machines. 
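// The saving comes from symmetry: for j != i-j the cross products a[j]*a[i-j] and a[i-j]*a[j] are equal, so
// each pair is computed once and accumulated twice via MACC2, while the single square term a[j]*a[j]
// (reached only when i is even) is accumulated once via MACC.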
3427 3428 static void NOINLINE 3429 montgomery_square(julong a[], julong n[], 3430 julong m[], julong inv, int len) { 3431 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3432 int i; 3433 3434 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3435 3436 for (i = 0; i < len; i++) { 3437 int j; 3438 int end = (i+1)/2; 3439 for (j = 0; j < end; j++) { 3440 MACC2(a[j], a[i-j], t0, t1, t2); 3441 MACC(m[j], n[i-j], t0, t1, t2); 3442 } 3443 if ((i & 1) == 0) { 3444 MACC(a[j], a[j], t0, t1, t2); 3445 } 3446 for (; j < i; j++) { 3447 MACC(m[j], n[i-j], t0, t1, t2); 3448 } 3449 m[i] = t0 * inv; 3450 MACC(m[i], n[0], t0, t1, t2); 3451 3452 assert(t0 == 0, "broken Montgomery square"); 3453 3454 t0 = t1; t1 = t2; t2 = 0; 3455 } 3456 3457 for (i = len; i < 2*len; i++) { 3458 int start = i-len+1; 3459 int end = start + (len - start)/2; 3460 int j; 3461 for (j = start; j < end; j++) { 3462 MACC2(a[j], a[i-j], t0, t1, t2); 3463 MACC(m[j], n[i-j], t0, t1, t2); 3464 } 3465 if ((i & 1) == 0) { 3466 MACC(a[j], a[j], t0, t1, t2); 3467 } 3468 for (; j < len; j++) { 3469 MACC(m[j], n[i-j], t0, t1, t2); 3470 } 3471 m[i-len] = t0; 3472 t0 = t1; t1 = t2; t2 = 0; 3473 } 3474 3475 while (t0) 3476 t0 = sub(m, n, t0, len); 3477 } 3478 3479 // Swap words in a longword. 3480 static julong swap(julong x) { 3481 return (x << 32) | (x >> 32); 3482 } 3483 3484 // Copy len longwords from s to d, word-swapping as we go. The 3485 // destination array is reversed. 3486 static void reverse_words(julong *s, julong *d, int len) { 3487 d += len; 3488 while(len-- > 0) { 3489 d--; 3490 *d = swap(*s); 3491 s++; 3492 } 3493 } 3494 3495 // The threshold at which squaring is advantageous was determined 3496 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 3497 #define MONTGOMERY_SQUARING_THRESHOLD 64 3498 3499 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3500 jint len, jlong inv, 3501 jint *m_ints) { 3502 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3503 int longwords = len/2; 3504 3505 // Make very sure we don't use so much space that the stack might 3506 // overflow. 512 jints corresponds to a 16384-bit integer and 3507 // will use a total of 8k bytes of stack space here. 3508 int divisor = sizeof(julong) * 4; 3509 guarantee(longwords <= 8192 / divisor, "must be"); 3510 int total_allocation = longwords * sizeof (julong) * 4; 3511 julong *scratch = (julong *)alloca(total_allocation); 3512 3513 // Local scratch arrays 3514 julong 3515 *a = scratch + 0 * longwords, 3516 *b = scratch + 1 * longwords, 3517 *n = scratch + 2 * longwords, 3518 *m = scratch + 3 * longwords; 3519 3520 reverse_words((julong *)a_ints, a, longwords); 3521 reverse_words((julong *)b_ints, b, longwords); 3522 reverse_words((julong *)n_ints, n, longwords); 3523 3524 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3525 3526 reverse_words(m, (julong *)m_ints, longwords); 3527 } 3528 3529 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3530 jint len, jlong inv, 3531 jint *m_ints) { 3532 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3533 int longwords = len/2; 3534 3535 // Make very sure we don't use so much space that the stack might 3536 // overflow. 512 jints corresponds to a 16384-bit integer and 3537 // will use a total of 6k bytes of stack space here.
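// (512 jints are 256 julongs; three 256-julong scratch arrays take 3 * 256 * 8 = 6144 bytes. The guarantee
// below caps longwords at 8192 / (sizeof(julong) * 3), so the alloca never exceeds 8K bytes of stack.)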
3538 int divisor = sizeof(julong) * 3; 3539 guarantee(longwords <= (8192 / divisor), "must be"); 3540 int total_allocation = longwords * sizeof (julong) * 3; 3541 julong *scratch = (julong *)alloca(total_allocation); 3542 3543 // Local scratch arrays 3544 julong 3545 *a = scratch + 0 * longwords, 3546 *n = scratch + 1 * longwords, 3547 *m = scratch + 2 * longwords; 3548 3549 reverse_words((julong *)a_ints, a, longwords); 3550 reverse_words((julong *)n_ints, n, longwords); 3551 3552 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3553 ::montgomery_square(a, n, m, (julong)inv, longwords); 3554 } else { 3555 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3556 } 3557 3558 reverse_words(m, (julong *)m_ints, longwords); 3559 } 3560 3561 #if INCLUDE_JFR 3562 3563 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 3564 // It returns a jobject handle to the event writer. 3565 // The handle is dereferenced and the return value is the event writer oop. 3566 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 3567 enum layout { 3568 rbp_off, 3569 rbpH_off, 3570 return_off, 3571 return_off2, 3572 framesize // inclusive of return address 3573 }; 3574 3575 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id); 3576 CodeBuffer code(name, 1024, 64); 3577 MacroAssembler* masm = new MacroAssembler(&code); 3578 address start = __ pc(); 3579 3580 __ enter(); 3581 address the_pc = __ pc(); 3582 3583 int frame_complete = the_pc - start; 3584 3585 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3586 __ movptr(c_rarg0, r15_thread); 3587 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 3588 __ reset_last_Java_frame(true); 3589 3590 // rax is jobject handle result, unpack and process it through a barrier. 3591 __ resolve_global_jobject(rax, c_rarg0); 3592 3593 __ leave(); 3594 __ ret(0); 3595 3596 OopMapSet* oop_maps = new OopMapSet(); 3597 OopMap* map = new OopMap(framesize, 1); 3598 oop_maps->add_gc_map(frame_complete, map); 3599 3600 RuntimeStub* stub = 3601 RuntimeStub::new_runtime_stub(name, 3602 &code, 3603 frame_complete, 3604 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3605 oop_maps, 3606 false); 3607 return stub; 3608 } 3609 3610 // For c2: call to return a leased buffer. 3611 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 3612 enum layout { 3613 rbp_off, 3614 rbpH_off, 3615 return_off, 3616 return_off2, 3617 framesize // inclusive of return address 3618 }; 3619 3620 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id); 3621 CodeBuffer code(name, 1024, 64); 3622 MacroAssembler* masm = new MacroAssembler(&code); 3623 address start = __ pc(); 3624 3625 __ enter(); 3626 address the_pc = __ pc(); 3627 3628 int frame_complete = the_pc - start; 3629 3630 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 3631 __ movptr(c_rarg0, r15_thread); 3632 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 3633 __ reset_last_Java_frame(true); 3634 3635 __ leave(); 3636 __ ret(0); 3637 3638 OopMapSet* oop_maps = new OopMapSet(); 3639 OopMap* map = new OopMap(framesize, 1); 3640 oop_maps->add_gc_map(frame_complete, map); 3641 3642 RuntimeStub* stub = 3643 RuntimeStub::new_runtime_stub(name, 3644 &code, 3645 frame_complete, 3646 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3647 oop_maps, 3648 false); 3649 return stub; 3650 } 3651 3652 #endif // INCLUDE_JFR 3653