1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef _WINDOWS 26 #include "alloca.h" 27 #endif 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/nativeInst.hpp" 34 #include "code/vtableStubs.hpp" 35 #include "compiler/oopMap.hpp" 36 #include "gc/shared/collectedHeap.hpp" 37 #include "gc/shared/gcLocker.hpp" 38 #include "gc/shared/barrierSet.hpp" 39 #include "gc/shared/barrierSetAssembler.hpp" 40 #include "interpreter/interpreter.hpp" 41 #include "logging/log.hpp" 42 #include "memory/resourceArea.hpp" 43 #include "memory/universe.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "oops/method.inline.hpp" 46 #include "prims/methodHandles.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/globals.hpp" 50 #include "runtime/jniHandles.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/timerTrace.hpp" 56 #include "runtime/vframeArray.hpp" 57 #include "runtime/vm_version.hpp" 58 #include "utilities/align.hpp" 59 #include "utilities/checkedCast.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif // PRODUCT 79 80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 81 82 class RegisterSaver { 83 // Capture info about frame layout. Layout offsets are in jint 84 // units because compiler frame slots are jints. 
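  // Worked example (illustration only, assuming BytesPerInt == 4): with
  // XSAVE_AREA_BEGIN == 160 defined just below, xmm_off lands 160/4 == 40 jint
  // slots past fpu_state_off, and DEF_XMM_OFFS expands as
  //   DEF_XMM_OFFS(0)  ==>  xmm0_off = xmm_off + 0,  xmm0H_off
  //   DEF_XMM_OFFS(1)  ==>  xmm1_off = xmm_off + 4,  xmm1H_off
  // so consumers such as xmm0_offset_in_bytes() recover the byte offset simply
  // as BytesPerInt * <slot>_off.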
85 #define XSAVE_AREA_BEGIN 160 86 #define XSAVE_AREA_YMM_BEGIN 576 87 #define XSAVE_AREA_EGPRS 960 88 #define XSAVE_AREA_OPMASK_BEGIN 1088 89 #define XSAVE_AREA_ZMM_BEGIN 1152 90 #define XSAVE_AREA_UPPERBANK 1664 91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 96 enum layout { 97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 99 DEF_XMM_OFFS(0), 100 DEF_XMM_OFFS(1), 101 // 2..15 are implied in range usage 102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 103 DEF_YMM_OFFS(0), 104 DEF_YMM_OFFS(1), 105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 106 r16H_off, 107 r17_off, r17H_off, 108 r18_off, r18H_off, 109 r19_off, r19H_off, 110 r20_off, r20H_off, 111 r21_off, r21H_off, 112 r22_off, r22H_off, 113 r23_off, r23H_off, 114 r24_off, r24H_off, 115 r25_off, r25H_off, 116 r26_off, r26H_off, 117 r27_off, r27H_off, 118 r28_off, r28H_off, 119 r29_off, r29H_off, 120 r30_off, r30H_off, 121 r31_off, r31H_off, 122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 123 DEF_OPMASK_OFFS(0), 124 DEF_OPMASK_OFFS(1), 125 // 2..7 are implied in range usage 126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 127 DEF_ZMM_OFFS(0), 128 DEF_ZMM_OFFS(1), 129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 130 DEF_ZMM_UPPER_OFFS(16), 131 DEF_ZMM_UPPER_OFFS(17), 132 // 18..31 are implied in range usage 133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 134 fpu_stateH_end, 135 r15_off, r15H_off, 136 r14_off, r14H_off, 137 r13_off, r13H_off, 138 r12_off, r12H_off, 139 r11_off, r11H_off, 140 r10_off, r10H_off, 141 r9_off, r9H_off, 142 r8_off, r8H_off, 143 rdi_off, rdiH_off, 144 rsi_off, rsiH_off, 145 ignore_off, ignoreH_off, // extra copy of rbp 146 rsp_off, rspH_off, 147 rbx_off, rbxH_off, 148 rdx_off, rdxH_off, 149 rcx_off, rcxH_off, 150 rax_off, raxH_off, 151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 152 align_off, alignH_off, 153 flags_off, flagsH_off, 154 // The frame sender code expects that rbp will be in the "natural" place and 155 // will override any oopMap setting for it. We must therefore force the layout 156 // so that it agrees with the frame sender code. 
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints), not bytes or words.
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
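  // Rough picture of what the sequence below builds, from high to low address
  // (a sketch for orientation only; the authoritative layout is the enum above):
  //   [return address]             <- already pushed by the caller
  //   [saved rbp]                  <- enter()
  //   [rflags]                     <- pushf()
  //   [8-byte alignment pad]       <- subq(rsp, 8)
  //   [rax .. r15]                 <- save_legacy_gprs()
  //   [fxsave/xsave image]         <- push_FPU_state(); the XSAVE_AREA_* offsets
  //                                   index into this block
  //   [argument register save area]<- subptr(rsp, frame::arg_reg_save_area_bytes),
  //                                   added further down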
207 208 __ enter(); // rsp becomes 16-byte aligned here 209 __ pushf(); 210 // Make sure rsp stays 16-byte aligned 211 __ subq(rsp, 8); 212 // Push CPU state in multiple of 16 bytes 213 __ save_legacy_gprs(); 214 __ push_FPU_state(); 215 216 217 // push cpu state handles this on EVEX enabled targets 218 if (save_wide_vectors) { 219 // Save upper half of YMM registers(0..15) 220 int base_addr = XSAVE_AREA_YMM_BEGIN; 221 for (int n = 0; n < 16; n++) { 222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 223 } 224 if (VM_Version::supports_evex()) { 225 // Save upper half of ZMM registers(0..15) 226 base_addr = XSAVE_AREA_ZMM_BEGIN; 227 for (int n = 0; n < 16; n++) { 228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 229 } 230 // Save full ZMM registers(16..num_xmm_regs) 231 base_addr = XSAVE_AREA_UPPERBANK; 232 off = 0; 233 int vector_len = Assembler::AVX_512bit; 234 for (int n = 16; n < num_xmm_regs; n++) { 235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 236 } 237 #if COMPILER2_OR_JVMCI 238 base_addr = XSAVE_AREA_OPMASK_BEGIN; 239 off = 0; 240 for(int n = 0; n < KRegister::number_of_registers; n++) { 241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 242 } 243 #endif 244 } 245 } else { 246 if (VM_Version::supports_evex()) { 247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 248 int base_addr = XSAVE_AREA_UPPERBANK; 249 off = 0; 250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 251 for (int n = 16; n < num_xmm_regs; n++) { 252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 253 } 254 #if COMPILER2_OR_JVMCI 255 base_addr = XSAVE_AREA_OPMASK_BEGIN; 256 off = 0; 257 for(int n = 0; n < KRegister::number_of_registers; n++) { 258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 259 } 260 #endif 261 } 262 } 263 264 #if COMPILER2_OR_JVMCI 265 if (UseAPX) { 266 int base_addr = XSAVE_AREA_EGPRS; 267 off = 0; 268 for (int n = 16; n < Register::number_of_registers; n++) { 269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 270 } 271 } 272 #endif 273 274 __ vzeroupper(); 275 if (frame::arg_reg_save_area_bytes != 0) { 276 // Allocate argument register save area 277 __ subptr(rsp, frame::arg_reg_save_area_bytes); 278 } 279 280 // Set an oopmap for the call site. This oopmap will map all 281 // oop-registers and debug-info registers as callee-saved. This 282 // will allow deoptimization at this safepoint to find all possible 283 // debug-info recordings, as well as let GC find all oops. 
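  // For example (illustrative, not generated code): the
  // set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg()) entry below tells
  // the oop map machinery that the caller's rax now lives rax_off compiler
  // slots (of BytesPerInt bytes each) above rsp of this frame, so a GC or a
  // deoptimization walking this frame can locate, and if rax held an oop,
  // update that saved value in place.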
284 285 OopMapSet *oop_maps = new OopMapSet(); 286 OopMap* map = new OopMap(frame_size_in_slots, 0); 287 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 289 290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 294 // rbp location is known implicitly by the frame sender code, needs no oopmap 295 // and the location where rbp was saved by is ignored 296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 306 307 if (UseAPX) { 308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 324 } 325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 326 // on EVEX enabled targets, we get it included in the xsave area 327 off = xmm0_off; 328 int delta = xmm1_off - off; 329 for (int n = 0; n < 16; n++) { 330 XMMRegister xmm_name = as_XMMRegister(n); 331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 332 off += delta; 333 } 334 if (UseAVX > 2) { 335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 336 off = zmm16_off; 337 delta = zmm17_off - off; 338 for (int n = 16; n < num_xmm_regs; n++) { 339 XMMRegister zmm_name = as_XMMRegister(n); 340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 341 off += delta; 342 } 343 } 344 345 #if COMPILER2_OR_JVMCI 346 if (save_wide_vectors) { 347 // Save upper half of YMM registers(0..15) 348 off = ymm0_off; 349 delta = ymm1_off - ymm0_off; 350 for (int n = 0; n < 16; n++) { 351 XMMRegister ymm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 353 off += delta; 354 } 355 if (VM_Version::supports_evex()) { 356 // Save upper half of ZMM registers(0..15) 357 off = zmm0_off; 
358 delta = zmm1_off - zmm0_off; 359 for (int n = 0; n < 16; n++) { 360 XMMRegister zmm_name = as_XMMRegister(n); 361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 362 off += delta; 363 } 364 } 365 } 366 #endif // COMPILER2_OR_JVMCI 367 368 // %%% These should all be a waste but we'll keep things as they were for now 369 if (true) { 370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 374 // rbp location is known implicitly by the frame sender code, needs no oopmap 375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 385 if (UseAPX) { 386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 402 } 403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 404 // on EVEX enabled targets, we get it included in the xsave area 405 off = xmm0H_off; 406 delta = xmm1H_off - off; 407 for (int n = 0; n < 16; n++) { 408 XMMRegister xmm_name = as_XMMRegister(n); 409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 410 off += delta; 411 } 412 if (UseAVX > 2) { 413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 414 off = zmm16H_off; 415 delta = zmm17H_off - off; 416 for (int n = 16; n < num_xmm_regs; n++) { 417 XMMRegister zmm_name = as_XMMRegister(n); 418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 419 off += delta; 420 } 421 } 422 } 423 424 return map; 
425 } 426 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 428 int num_xmm_regs = XMMRegister::available_xmm_registers(); 429 if (frame::arg_reg_save_area_bytes != 0) { 430 // Pop arg register save area 431 __ addptr(rsp, frame::arg_reg_save_area_bytes); 432 } 433 434 #if COMPILER2_OR_JVMCI 435 if (restore_wide_vectors) { 436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 438 } 439 #else 440 assert(!restore_wide_vectors, "vectors are generated only by C2"); 441 #endif 442 443 __ vzeroupper(); 444 445 // On EVEX enabled targets everything is handled in pop fpu state 446 if (restore_wide_vectors) { 447 // Restore upper half of YMM registers (0..15) 448 int base_addr = XSAVE_AREA_YMM_BEGIN; 449 for (int n = 0; n < 16; n++) { 450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 451 } 452 if (VM_Version::supports_evex()) { 453 // Restore upper half of ZMM registers (0..15) 454 base_addr = XSAVE_AREA_ZMM_BEGIN; 455 for (int n = 0; n < 16; n++) { 456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 457 } 458 // Restore full ZMM registers(16..num_xmm_regs) 459 base_addr = XSAVE_AREA_UPPERBANK; 460 int vector_len = Assembler::AVX_512bit; 461 int off = 0; 462 for (int n = 16; n < num_xmm_regs; n++) { 463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 464 } 465 #if COMPILER2_OR_JVMCI 466 base_addr = XSAVE_AREA_OPMASK_BEGIN; 467 off = 0; 468 for (int n = 0; n < KRegister::number_of_registers; n++) { 469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 470 } 471 #endif 472 } 473 } else { 474 if (VM_Version::supports_evex()) { 475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 476 int base_addr = XSAVE_AREA_UPPERBANK; 477 int off = 0; 478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 479 for (int n = 16; n < num_xmm_regs; n++) { 480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 481 } 482 #if COMPILER2_OR_JVMCI 483 base_addr = XSAVE_AREA_OPMASK_BEGIN; 484 off = 0; 485 for (int n = 0; n < KRegister::number_of_registers; n++) { 486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 487 } 488 #endif 489 } 490 } 491 492 #if COMPILER2_OR_JVMCI 493 if (UseAPX) { 494 int base_addr = XSAVE_AREA_EGPRS; 495 int off = 0; 496 for (int n = 16; n < Register::number_of_registers; n++) { 497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 498 } 499 } 500 #endif 501 502 // Recover CPU state 503 __ pop_FPU_state(); 504 __ restore_legacy_gprs(); 505 __ addq(rsp, 8); 506 __ popf(); 507 // Get the rbp described implicitly by the calling convention (no oopMap) 508 __ pop(rbp); 509 } 510 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 512 513 // Just restore result register. Only used by deoptimization. By 514 // now any callee save register that needs to be restored to a c2 515 // caller of the deoptee has been extracted into the vframeArray 516 // and will be stuffed into the c2i adapter we create for later 517 // restoration so only result registers need to be restored here. 
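  // Typical use of this pair (a sketch only; the real call sites are the
  // deoptimization and safepoint blobs generated elsewhere from this class):
  //
  //   int frame_words;
  //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_words, false);
  //   ... call into the runtime, which may GC or deoptimize ...
  //   RegisterSaver::restore_result_registers(masm);  // keep only rax, rdx and xmm0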

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
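  // Illustration (a sketch, not something this code emits): for
  //   sig_bt = { T_INT, T_OBJECT, T_LONG, T_VOID, T_FLOAT }
  // the loop below produces
  //   T_INT    -> j_rarg0  (set1)
  //   T_OBJECT -> j_rarg1  (set2)
  //   T_LONG   -> j_rarg2  (set2), with the trailing T_VOID half marked bad
  //   T_FLOAT  -> j_farg0  (set1)
  // and returns stk_args == 0 since nothing spilled to the stack.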
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 564 }; 565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 566 j_farg0, j_farg1, j_farg2, j_farg3, 567 j_farg4, j_farg5, j_farg6, j_farg7 568 }; 569 570 571 uint int_args = 0; 572 uint fp_args = 0; 573 uint stk_args = 0; 574 575 for (int i = 0; i < total_args_passed; i++) { 576 switch (sig_bt[i]) { 577 case T_BOOLEAN: 578 case T_CHAR: 579 case T_BYTE: 580 case T_SHORT: 581 case T_INT: 582 if (int_args < Argument::n_int_register_parameters_j) { 583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 584 } else { 585 stk_args = align_up(stk_args, 2); 586 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 587 stk_args += 1; 588 } 589 break; 590 case T_VOID: 591 // halves of T_LONG or T_DOUBLE 592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 593 regs[i].set_bad(); 594 break; 595 case T_LONG: 596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 597 // fall through 598 case T_OBJECT: 599 case T_ARRAY: 600 case T_ADDRESS: 601 if (int_args < Argument::n_int_register_parameters_j) { 602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 603 } else { 604 stk_args = align_up(stk_args, 2); 605 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 606 stk_args += 2; 607 } 608 break; 609 case T_FLOAT: 610 if (fp_args < Argument::n_float_register_parameters_j) { 611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 612 } else { 613 stk_args = align_up(stk_args, 2); 614 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 615 stk_args += 1; 616 } 617 break; 618 case T_DOUBLE: 619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 620 if (fp_args < Argument::n_float_register_parameters_j) { 621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 622 } else { 623 stk_args = align_up(stk_args, 2); 624 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 625 stk_args += 2; 626 } 627 break; 628 default: 629 ShouldNotReachHere(); 630 break; 631 } 632 } 633 634 return stk_args; 635 } 636 637 // Patch the callers callsite with entry to compiled code if it exists. 638 static void patch_callers_callsite(MacroAssembler *masm) { 639 Label L; 640 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 641 __ jcc(Assembler::equal, L); 642 643 // Save the current stack pointer 644 __ mov(r13, rsp); 645 // Schedule the branch target address early. 
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}

static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    //  i   st_off
    //  0   32 T_LONG
    //  1   24 T_VOID
    //  2   16 T_OBJECT
    //  3    8 T_BOOL
    //  -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, else we lose the
  // alignment we expect in all compiled code, and the register save code
  // can segv when fxsave instructions find an improperly aligned stack
  // pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address, misaligning the stack so that the youngest frame
  // sees the same alignment it would see right after a call instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
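  // (For illustration: the wrong-method handler can then read the Method* back
  // out of the thread's callee_target field, written just below, instead of
  // trying to decode the call site.)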

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because the c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                            int total_args_passed,
                                            int comp_args_on_stack,
                                            const BasicType *sig_bt,
                                            const VMRegPair *regs,
                                            address entry_address[AdapterBlob::ENTRY_COUNT]) {
  entry_address[AdapterBlob::I2C] = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  Label skip_fixup;

  Register data     = rax;
  Register receiver = j_rarg0;
  Register temp     = rbx;

  {
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // The method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  entry_address[AdapterBlob::C2I] = __ pc();

  // Class initialization barrier for static methods
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
  return;
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        int total_args_passed) {

  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.
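  // Illustration (a sketch): for sig_bt = { T_OBJECT, T_INT, T_DOUBLE, T_VOID }
  // the loop below assigns
  //   - non-Windows (SysV): c_rarg0, c_rarg1, c_farg0, and returns stk_args == 0
  //   - Win64:              c_rarg0, c_rarg1, c_farg2, and returns stk_args == 8,
  //     because int and fp argument positions share one sequence, every register
  //     argument also reserves a two-slot home location, and the ABI minimum of
  //     four home slots is enforced at the end of the function.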

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for four 64-bit registers to be stored down.
1180 if (stk_args < 8) { 1181 stk_args = 8; 1182 } 1183 #endif // _WIN64 1184 1185 return stk_args; 1186 } 1187 1188 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1189 uint num_bits, 1190 uint total_args_passed) { 1191 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1192 "only certain vector sizes are supported for now"); 1193 1194 static const XMMRegister VEC_ArgReg[32] = { 1195 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1196 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1197 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1198 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1199 }; 1200 1201 uint stk_args = 0; 1202 uint fp_args = 0; 1203 1204 for (uint i = 0; i < total_args_passed; i++) { 1205 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1206 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1207 regs[i].set_pair(vmreg->next(next_val), vmreg); 1208 } 1209 1210 return stk_args; 1211 } 1212 1213 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1214 // We always ignore the frame_slots arg and just use the space just below frame pointer 1215 // which by this time is free to use 1216 switch (ret_type) { 1217 case T_FLOAT: 1218 __ movflt(Address(rbp, -wordSize), xmm0); 1219 break; 1220 case T_DOUBLE: 1221 __ movdbl(Address(rbp, -wordSize), xmm0); 1222 break; 1223 case T_VOID: break; 1224 default: { 1225 __ movptr(Address(rbp, -wordSize), rax); 1226 } 1227 } 1228 } 1229 1230 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1231 // We always ignore the frame_slots arg and just use the space just below frame pointer 1232 // which by this time is free to use 1233 switch (ret_type) { 1234 case T_FLOAT: 1235 __ movflt(xmm0, Address(rbp, -wordSize)); 1236 break; 1237 case T_DOUBLE: 1238 __ movdbl(xmm0, Address(rbp, -wordSize)); 1239 break; 1240 case T_VOID: break; 1241 default: { 1242 __ movptr(rax, Address(rbp, -wordSize)); 1243 } 1244 } 1245 } 1246 1247 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1248 for ( int i = first_arg ; i < arg_count ; i++ ) { 1249 if (args[i].first()->is_Register()) { 1250 __ push(args[i].first()->as_Register()); 1251 } else if (args[i].first()->is_XMMRegister()) { 1252 __ subptr(rsp, 2*wordSize); 1253 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1254 } 1255 } 1256 } 1257 1258 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1259 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1260 if (args[i].first()->is_Register()) { 1261 __ pop(args[i].first()->as_Register()); 1262 } else if (args[i].first()->is_XMMRegister()) { 1263 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1264 __ addptr(rsp, 2*wordSize); 1265 } 1266 } 1267 } 1268 1269 static void verify_oop_args(MacroAssembler* masm, 1270 const methodHandle& method, 1271 const BasicType* sig_bt, 1272 const VMRegPair* regs) { 1273 Register temp_reg = rbx; // not part of any compiled calling seq 1274 if (VerifyOops) { 1275 for (int i = 0; i < method->size_of_parameters(); i++) { 1276 if (is_reference_type(sig_bt[i])) { 1277 VMReg r = regs[i].first(); 1278 assert(r->is_valid(), "bad oop arg"); 1279 if (r->is_stack()) { 1280 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1281 __ verify_oop(temp_reg); 1282 } else { 1283 __ 
verify_oop(r->as_Register()); 1284 } 1285 } 1286 } 1287 } 1288 } 1289 1290 static void check_continuation_enter_argument(VMReg actual_vmreg, 1291 Register expected_reg, 1292 const char* name) { 1293 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1294 assert(actual_vmreg->as_Register() == expected_reg, 1295 "%s is in unexpected register: %s instead of %s", 1296 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1297 } 1298 1299 1300 //---------------------------- continuation_enter_setup --------------------------- 1301 // 1302 // Arguments: 1303 // None. 1304 // 1305 // Results: 1306 // rsp: pointer to blank ContinuationEntry 1307 // 1308 // Kills: 1309 // rax 1310 // 1311 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1312 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1313 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1314 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1315 1316 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1317 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1318 1319 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1320 OopMap* map = new OopMap(frame_size, 0); 1321 1322 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1323 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1324 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1325 1326 return map; 1327 } 1328 1329 //---------------------------- fill_continuation_entry --------------------------- 1330 // 1331 // Arguments: 1332 // rsp: pointer to blank Continuation entry 1333 // reg_cont_obj: pointer to the continuation 1334 // reg_flags: flags 1335 // 1336 // Results: 1337 // rsp: pointer to filled out ContinuationEntry 1338 // 1339 // Kills: 1340 // rax 1341 // 1342 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1343 assert_different_registers(rax, reg_cont_obj, reg_flags); 1344 #ifdef ASSERT 1345 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1346 #endif 1347 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1348 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1349 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1350 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1351 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1352 1353 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1354 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1355 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1356 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1357 1358 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1359 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1360 } 1361 1362 //---------------------------- continuation_enter_cleanup --------------------------- 1363 // 1364 // Arguments: 1365 // rsp: pointer to the ContinuationEntry 1366 // 1367 // Results: 1368 // rsp: pointer to the spilled rbp in the entry frame 1369 // 1370 // Kills: 1371 // rbx 1372 // 1373 static void continuation_enter_cleanup(MacroAssembler* masm) { 1374 #ifdef ASSERT 1375 
Label L_good_sp; 1376 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1377 __ jcc(Assembler::equal, L_good_sp); 1378 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1379 __ bind(L_good_sp); 1380 #endif 1381 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1382 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1383 1384 if (CheckJNICalls) { 1385 // Check if this is a virtual thread continuation 1386 Label L_skip_vthread_code; 1387 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1388 __ jcc(Assembler::equal, L_skip_vthread_code); 1389 1390 // If the held monitor count is > 0 and this vthread is terminating then 1391 // it failed to release a JNI monitor. So we issue the same log message 1392 // that JavaThread::exit does. 1393 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1394 __ jcc(Assembler::equal, L_skip_vthread_code); 1395 1396 // rax may hold an exception oop, save it before the call 1397 __ push(rax); 1398 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1399 __ pop(rax); 1400 1401 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1402 // on termination. The held count is implicitly zeroed below when we restore from 1403 // the parent held count (which has to be zero). 1404 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1405 1406 __ bind(L_skip_vthread_code); 1407 } 1408 #ifdef ASSERT 1409 else { 1410 // Check if this is a virtual thread continuation 1411 Label L_skip_vthread_code; 1412 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1413 __ jcc(Assembler::equal, L_skip_vthread_code); 1414 1415 // See comment just above. If not checking JNI calls the JNI count is only 1416 // needed for assertion checking. 1417 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1418 1419 __ bind(L_skip_vthread_code); 1420 } 1421 #endif 1422 1423 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1424 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1425 1426 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1427 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1428 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1429 } 1430 1431 static void gen_continuation_enter(MacroAssembler* masm, 1432 const VMRegPair* regs, 1433 int& exception_offset, 1434 OopMapSet* oop_maps, 1435 int& frame_complete, 1436 int& stack_slots, 1437 int& interpreted_entry_offset, 1438 int& compiled_entry_offset) { 1439 1440 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1441 int pos_cont_obj = 0; 1442 int pos_is_cont = 1; 1443 int pos_is_virtual = 2; 1444 1445 // The platform-specific calling convention may present the arguments in various registers. 1446 // To simplify the rest of the code, we expect the arguments to reside at these known 1447 // registers, and we additionally check the placement here in case calling convention ever 1448 // changes. 
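  // Concretely (an illustration of the checks below, not new behavior):
  // enterSpecial(Continuation, boolean, boolean) is static, so under the Java
  // calling convention its three arguments arrive in j_rarg0..j_rarg2, which on
  // this port are the C ABI registers c_rarg1..c_rarg3 (the "shifted" convention
  // described above java_calling_convention()); check_continuation_enter_argument()
  // asserts that regs[] still agrees with that assumption.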
1449 Register reg_cont_obj = c_rarg1; 1450 Register reg_is_cont = c_rarg2; 1451 Register reg_is_virtual = c_rarg3; 1452 1453 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1454 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1455 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1456 1457 // Utility methods kill rax, make sure there are no collisions 1458 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1459 1460 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1461 relocInfo::static_call_type); 1462 1463 address start = __ pc(); 1464 1465 Label L_thaw, L_exit; 1466 1467 // i2i entry used at interp_only_mode only 1468 interpreted_entry_offset = __ pc() - start; 1469 { 1470 #ifdef ASSERT 1471 Label is_interp_only; 1472 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1473 __ jcc(Assembler::notEqual, is_interp_only); 1474 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1475 __ bind(is_interp_only); 1476 #endif 1477 1478 __ pop(rax); // return address 1479 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1480 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1481 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1482 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1483 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1484 __ push(rax); // return address 1485 __ push_cont_fastpath(); 1486 1487 __ enter(); 1488 1489 stack_slots = 2; // will be adjusted in setup 1490 OopMap* map = continuation_enter_setup(masm, stack_slots); 1491 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1492 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1493 1494 __ verify_oop(reg_cont_obj); 1495 1496 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1497 1498 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1499 __ testptr(reg_is_cont, reg_is_cont); 1500 __ jcc(Assembler::notZero, L_thaw); 1501 1502 // --- Resolve path 1503 1504 // Make sure the call is patchable 1505 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1506 // Emit stub for static call 1507 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1508 if (stub == nullptr) { 1509 fatal("CodeCache is full at gen_continuation_enter"); 1510 } 1511 __ call(resolve); 1512 oop_maps->add_gc_map(__ pc() - start, map); 1513 __ post_call_nop(); 1514 1515 __ jmp(L_exit); 1516 } 1517 1518 // compiled entry 1519 __ align(CodeEntryAlignment); 1520 compiled_entry_offset = __ pc() - start; 1521 __ enter(); 1522 1523 stack_slots = 2; // will be adjusted in setup 1524 OopMap* map = continuation_enter_setup(masm, stack_slots); 1525 1526 // Frame is now completed as far as size and linkage. 1527 frame_complete = __ pc() - start; 1528 1529 __ verify_oop(reg_cont_obj); 1530 1531 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1532 1533 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1534 __ testptr(reg_is_cont, reg_is_cont); 1535 __ jccb(Assembler::notZero, L_thaw); 1536 1537 // --- call Continuation.enter(Continuation c, boolean isContinue) 1538 1539 // Make sure the call is patchable 1540 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1541 1542 // Emit stub for static call 1543 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1544 if (stub == nullptr) { 1545 fatal("CodeCache is full at gen_continuation_enter"); 1546 } 1547 1548 // The call needs to be resolved. There's a special case for this in 1549 // SharedRuntime::find_callee_info_helper() which calls 1550 // LinkResolver::resolve_continuation_enter() which resolves the call to 1551 // Continuation.enter(Continuation c, boolean isContinue). 1552 __ call(resolve); 1553 1554 oop_maps->add_gc_map(__ pc() - start, map); 1555 __ post_call_nop(); 1556 1557 __ jmpb(L_exit); 1558 1559 // --- Thawing path 1560 1561 __ bind(L_thaw); 1562 1563 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1564 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1565 1566 ContinuationEntry::_return_pc_offset = __ pc() - start; 1567 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1568 __ post_call_nop(); 1569 1570 // --- Normal exit (resolve/thawing) 1571 1572 __ bind(L_exit); 1573 ContinuationEntry::_cleanup_offset = __ pc() - start; 1574 continuation_enter_cleanup(masm); 1575 __ pop(rbp); 1576 __ ret(0); 1577 1578 // --- Exception handling path 1579 1580 exception_offset = __ pc() - start; 1581 1582 continuation_enter_cleanup(masm); 1583 __ pop(rbp); 1584 1585 __ movptr(c_rarg0, r15_thread); 1586 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1587 1588 // rax still holds the original exception oop, save it before the call 1589 __ push(rax); 1590 1591 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1592 __ movptr(rbx, rax); 1593 1594 // Continue at exception handler: 1595 // rax: exception oop 1596 // rbx: exception handler 1597 // rdx: exception pc 1598 __ pop(rax); 1599 __ verify_oop(rax); 1600 __ pop(rdx); 1601 __ jmp(rbx); 1602 } 1603 1604 static void gen_continuation_yield(MacroAssembler* masm, 1605 const VMRegPair* regs, 1606 OopMapSet* oop_maps, 1607 int& frame_complete, 1608 int& stack_slots, 1609 int& compiled_entry_offset) { 1610 enum layout { 1611 rbp_off, 1612 rbpH_off, 1613 return_off, 1614 return_off2, 1615 framesize // inclusive of return address 1616 }; 1617 stack_slots = framesize / VMRegImpl::slots_per_word; 1618 assert(stack_slots == 2, "recheck layout"); 1619 1620 address start = __ pc(); 1621 compiled_entry_offset = __ pc() - start; 1622 __ enter(); 1623 address the_pc = __ pc(); 1624 1625 frame_complete = the_pc - start; 1626 1627 // This nop must be exactly at the PC we push into the frame info. 1628 // We use this nop for fast CodeBlob lookup, associate the OopMap 1629 // with it right away. 
1630 __ post_call_nop(); 1631 OopMap* map = new OopMap(framesize, 1); 1632 oop_maps->add_gc_map(frame_complete, map); 1633 1634 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1635 __ movptr(c_rarg0, r15_thread); 1636 __ movptr(c_rarg1, rsp); 1637 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1638 __ reset_last_Java_frame(true); 1639 1640 Label L_pinned; 1641 1642 __ testptr(rax, rax); 1643 __ jcc(Assembler::notZero, L_pinned); 1644 1645 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1646 continuation_enter_cleanup(masm); 1647 __ pop(rbp); 1648 __ ret(0); 1649 1650 __ bind(L_pinned); 1651 1652 // Pinned, return to caller 1653 1654 // handle pending exception thrown by freeze 1655 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1656 Label ok; 1657 __ jcc(Assembler::equal, ok); 1658 __ leave(); 1659 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1660 __ bind(ok); 1661 1662 __ leave(); 1663 __ ret(0); 1664 } 1665 1666 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1667 ::continuation_enter_cleanup(masm); 1668 } 1669 1670 static void gen_special_dispatch(MacroAssembler* masm, 1671 const methodHandle& method, 1672 const BasicType* sig_bt, 1673 const VMRegPair* regs) { 1674 verify_oop_args(masm, method, sig_bt, regs); 1675 vmIntrinsics::ID iid = method->intrinsic_id(); 1676 1677 // Now write the args into the outgoing interpreter space 1678 bool has_receiver = false; 1679 Register receiver_reg = noreg; 1680 int member_arg_pos = -1; 1681 Register member_reg = noreg; 1682 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1683 if (ref_kind != 0) { 1684 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1685 member_reg = rbx; // known to be free at this point 1686 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1687 } else if (iid == vmIntrinsics::_invokeBasic) { 1688 has_receiver = true; 1689 } else if (iid == vmIntrinsics::_linkToNative) { 1690 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1691 member_reg = rbx; // known to be free at this point 1692 } else { 1693 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1694 } 1695 1696 if (member_reg != noreg) { 1697 // Load the member_arg into register, if necessary. 1698 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1699 VMReg r = regs[member_arg_pos].first(); 1700 if (r->is_stack()) { 1701 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1702 } else { 1703 // no data motion is needed 1704 member_reg = r->as_Register(); 1705 } 1706 } 1707 1708 if (has_receiver) { 1709 // Make sure the receiver is loaded into a register. 1710 assert(method->size_of_parameters() > 0, "oob"); 1711 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1712 VMReg r = regs[0].first(); 1713 assert(r->is_valid(), "bad receiver arg"); 1714 if (r->is_stack()) { 1715 // Porting note: This assumes that compiled calling conventions always 1716 // pass the receiver oop in a register. If this is not true on some 1717 // platform, pick a temp and load the receiver from stack. 
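// On x86_64 the compiled convention does pass the receiver in a register
// (j_rarg0), so this branch should never be taken; the fatal() below asserts
// that, and the stack load after it is retained only as a porting template.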
1718 fatal("receiver always in a register"); 1719 receiver_reg = j_rarg0; // known to be free at this point 1720 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1721 } else { 1722 // no data motion is needed 1723 receiver_reg = r->as_Register(); 1724 } 1725 } 1726 1727 // Figure out which address we are really jumping to: 1728 MethodHandles::generate_method_handle_dispatch(masm, iid, 1729 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 1730 } 1731 1732 // --------------------------------------------------------------------------- 1733 // Generate a native wrapper for a given method. The method takes arguments 1734 // in the Java compiled code convention, marshals them to the native 1735 // convention (handlizes oops, etc), transitions to native, makes the call, 1736 // returns to java state (possibly blocking), unhandlizes any result and 1737 // returns. 1738 // 1739 // Critical native functions are a shorthand for the use of 1740 // GetPrimtiveArrayCritical and disallow the use of any other JNI 1741 // functions. The wrapper is expected to unpack the arguments before 1742 // passing them to the callee. Critical native functions leave the state _in_Java, 1743 // since they cannot stop for GC. 1744 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 1745 // block and the check for pending exceptions it's impossible for them 1746 // to be thrown. 1747 // 1748 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 1749 const methodHandle& method, 1750 int compile_id, 1751 BasicType* in_sig_bt, 1752 VMRegPair* in_regs, 1753 BasicType ret_type) { 1754 if (method->is_continuation_native_intrinsic()) { 1755 int exception_offset = -1; 1756 OopMapSet* oop_maps = new OopMapSet(); 1757 int frame_complete = -1; 1758 int stack_slots = -1; 1759 int interpreted_entry_offset = -1; 1760 int vep_offset = -1; 1761 if (method->is_continuation_enter_intrinsic()) { 1762 gen_continuation_enter(masm, 1763 in_regs, 1764 exception_offset, 1765 oop_maps, 1766 frame_complete, 1767 stack_slots, 1768 interpreted_entry_offset, 1769 vep_offset); 1770 } else if (method->is_continuation_yield_intrinsic()) { 1771 gen_continuation_yield(masm, 1772 in_regs, 1773 oop_maps, 1774 frame_complete, 1775 stack_slots, 1776 vep_offset); 1777 } else { 1778 guarantee(false, "Unknown Continuation native intrinsic"); 1779 } 1780 1781 #ifdef ASSERT 1782 if (method->is_continuation_enter_intrinsic()) { 1783 assert(interpreted_entry_offset != -1, "Must be set"); 1784 assert(exception_offset != -1, "Must be set"); 1785 } else { 1786 assert(interpreted_entry_offset == -1, "Must be unset"); 1787 assert(exception_offset == -1, "Must be unset"); 1788 } 1789 assert(frame_complete != -1, "Must be set"); 1790 assert(stack_slots != -1, "Must be set"); 1791 assert(vep_offset != -1, "Must be set"); 1792 #endif 1793 1794 __ flush(); 1795 nmethod* nm = nmethod::new_native_nmethod(method, 1796 compile_id, 1797 masm->code(), 1798 vep_offset, 1799 frame_complete, 1800 stack_slots, 1801 in_ByteSize(-1), 1802 in_ByteSize(-1), 1803 oop_maps, 1804 exception_offset); 1805 if (nm == nullptr) return nm; 1806 if (method->is_continuation_enter_intrinsic()) { 1807 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 1808 } else if (method->is_continuation_yield_intrinsic()) { 1809 ContinuationEntry::set_yield_code(nm); 1810 } 1811 return nm; 1812 } 1813 1814 if (method->is_method_handle_intrinsic()) { 1815 vmIntrinsics::ID iid = method->intrinsic_id(); 
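// Method handle intrinsics need none of the JNI machinery below:
// gen_special_dispatch() merely shuffles the receiver/MemberName into place
// and jumps to the target, so the nmethod created here carries no oop maps
// and no handle or lock areas (hence the in_ByteSize(-1) arguments).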
1816 intptr_t start = (intptr_t)__ pc(); 1817 int vep_offset = ((intptr_t)__ pc()) - start; 1818 gen_special_dispatch(masm, 1819 method, 1820 in_sig_bt, 1821 in_regs); 1822 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 1823 __ flush(); 1824 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 1825 return nmethod::new_native_nmethod(method, 1826 compile_id, 1827 masm->code(), 1828 vep_offset, 1829 frame_complete, 1830 stack_slots / VMRegImpl::slots_per_word, 1831 in_ByteSize(-1), 1832 in_ByteSize(-1), 1833 nullptr); 1834 } 1835 address native_func = method->native_function(); 1836 assert(native_func != nullptr, "must have function"); 1837 1838 // An OopMap for lock (and class if static) 1839 OopMapSet *oop_maps = new OopMapSet(); 1840 intptr_t start = (intptr_t)__ pc(); 1841 1842 // We have received a description of where all the java arg are located 1843 // on entry to the wrapper. We need to convert these args to where 1844 // the jni function will expect them. To figure out where they go 1845 // we convert the java signature to a C signature by inserting 1846 // the hidden arguments as arg[0] and possibly arg[1] (static method) 1847 1848 const int total_in_args = method->size_of_parameters(); 1849 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 1850 1851 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 1852 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 1853 1854 int argc = 0; 1855 out_sig_bt[argc++] = T_ADDRESS; 1856 if (method->is_static()) { 1857 out_sig_bt[argc++] = T_OBJECT; 1858 } 1859 1860 for (int i = 0; i < total_in_args ; i++ ) { 1861 out_sig_bt[argc++] = in_sig_bt[i]; 1862 } 1863 1864 // Now figure out where the args must be stored and how much stack space 1865 // they require. 1866 int out_arg_slots; 1867 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 1868 1869 // Compute framesize for the wrapper. We need to handlize all oops in 1870 // incoming registers 1871 1872 // Calculate the total number of stack slots we will need. 
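// In outline, the accounting below adds up as follows (all quantities are
// VMRegImpl stack slots):
//
//   stack_slots = out_preserve_stack_slots() + out_arg_slots  // C ABI outgoing area
//               + 6 * slots_per_word                          // oop handle area (6 int arg registers)
//               + (static ? slots_per_word : 0)               // handlized class mirror
//               + (synchronized ? slots_per_word : 0)         // BasicLock box
//               + 6                                           // 2 temp slots, return address, saved rbp
//
// and the total is then rounded up to StackAlignmentInSlots.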
1873 1874 // First count the abi requirement plus all of the outgoing args 1875 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 1876 1877 // Now the space for the inbound oop handle area 1878 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 1879 1880 int oop_handle_offset = stack_slots; 1881 stack_slots += total_save_slots; 1882 1883 // Now any space we need for handlizing a klass if static method 1884 1885 int klass_slot_offset = 0; 1886 int klass_offset = -1; 1887 int lock_slot_offset = 0; 1888 bool is_static = false; 1889 1890 if (method->is_static()) { 1891 klass_slot_offset = stack_slots; 1892 stack_slots += VMRegImpl::slots_per_word; 1893 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 1894 is_static = true; 1895 } 1896 1897 // Plus a lock if needed 1898 1899 if (method->is_synchronized()) { 1900 lock_slot_offset = stack_slots; 1901 stack_slots += VMRegImpl::slots_per_word; 1902 } 1903 1904 // Now a place (+2) to save return values or temp during shuffling 1905 // + 4 for return address (which we own) and saved rbp 1906 stack_slots += 6; 1907 1908 // Ok The space we have allocated will look like: 1909 // 1910 // 1911 // FP-> | | 1912 // |---------------------| 1913 // | 2 slots for moves | 1914 // |---------------------| 1915 // | lock box (if sync) | 1916 // |---------------------| <- lock_slot_offset 1917 // | klass (if static) | 1918 // |---------------------| <- klass_slot_offset 1919 // | oopHandle area | 1920 // |---------------------| <- oop_handle_offset (6 java arg registers) 1921 // | outbound memory | 1922 // | based arguments | 1923 // | | 1924 // |---------------------| 1925 // | | 1926 // SP-> | out_preserved_slots | 1927 // 1928 // 1929 1930 1931 // Now compute actual number of stack words we need rounding to make 1932 // stack properly aligned. 1933 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 1934 1935 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 1936 1937 // First thing make an ic check to see if we should even be here 1938 1939 // We are free to use all registers as temps without saving them and 1940 // restoring them except rbp. rbp is the only callee save register 1941 // as far as the interpreter and the compiler(s) are concerned. 1942 1943 const Register receiver = j_rarg0; 1944 1945 Label exception_pending; 1946 1947 assert_different_registers(receiver, rscratch1, rscratch2); 1948 __ verify_oop(receiver); 1949 __ ic_check(8 /* end_alignment */); 1950 1951 int vep_offset = ((intptr_t)__ pc()) - start; 1952 1953 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 1954 Label L_skip_barrier; 1955 Register klass = r10; 1956 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 1957 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 1958 1959 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 1960 1961 __ bind(L_skip_barrier); 1962 } 1963 1964 #ifdef COMPILER1 1965 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
1966 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 1967 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 1968 } 1969 #endif // COMPILER1 1970 1971 // The instruction at the verified entry point must be 5 bytes or longer 1972 // because it can be patched on the fly by make_non_entrant. The stack bang 1973 // instruction fits that requirement. 1974 1975 // Generate stack overflow check 1976 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 1977 1978 // Generate a new frame for the wrapper. 1979 __ enter(); 1980 // -2 because return address is already present and so is saved rbp 1981 __ subptr(rsp, stack_size - 2*wordSize); 1982 1983 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1984 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 1985 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 1986 1987 // Frame is now completed as far as size and linkage. 1988 int frame_complete = ((intptr_t)__ pc()) - start; 1989 1990 #ifdef ASSERT 1991 __ check_stack_alignment(rsp, "improperly aligned stack"); 1992 #endif /* ASSERT */ 1993 1994 1995 // We use r14 as the oop handle for the receiver/klass 1996 // It is callee save so it survives the call to native 1997 1998 const Register oop_handle_reg = r14; 1999 2000 // 2001 // We immediately shuffle the arguments so that any vm call we have to 2002 // make from here on out (sync slow path, jvmti, etc.) we will have 2003 // captured the oops from our caller and have a valid oopMap for 2004 // them. 2005 2006 // ----------------- 2007 // The Grand Shuffle 2008 2009 // The Java calling convention is either equal (linux) or denser (win64) than the 2010 // c calling convention. However the because of the jni_env argument the c calling 2011 // convention always has at least one more (and two for static) arguments than Java. 2012 // Therefore if we move the args from java -> c backwards then we will never have 2013 // a register->register conflict and we don't have to build a dependency graph 2014 // and figure out how to break any cycles. 2015 // 2016 2017 // Record esp-based slot for receiver on stack for non-static methods 2018 int receiver_offset = -1; 2019 2020 // This is a trick. We double the stack slots so we can claim 2021 // the oops in the caller's frame. Since we are sure to have 2022 // more args than the caller doubling is enough to make 2023 // sure we can capture all the incoming oop args from the 2024 // caller. 2025 // 2026 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2027 2028 // Mark location of rbp (someday) 2029 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2030 2031 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2032 // All inbound args are referenced based on rbp and all outbound args via rsp. 2033 2034 2035 #ifdef ASSERT 2036 bool reg_destroyed[Register::number_of_registers]; 2037 bool freg_destroyed[XMMRegister::number_of_registers]; 2038 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2039 reg_destroyed[r] = false; 2040 } 2041 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2042 freg_destroyed[f] = false; 2043 } 2044 2045 #endif /* ASSERT */ 2046 2047 // For JNI natives the incoming and outgoing registers are offset upwards. 
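// arg_order below is filled with (java_index, c_index) pairs starting from the
// last argument, so the moves run right-to-left; since the C signature has at
// least one extra leading argument (JNIEnv*, plus the mirror for static
// methods), an incoming value is never overwritten before it has been moved.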
2048 GrowableArray<int> arg_order(2 * total_in_args); 2049 2050 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2051 arg_order.push(i); 2052 arg_order.push(c_arg); 2053 } 2054 2055 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2056 int i = arg_order.at(ai); 2057 int c_arg = arg_order.at(ai + 1); 2058 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2059 #ifdef ASSERT 2060 if (in_regs[i].first()->is_Register()) { 2061 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2062 } else if (in_regs[i].first()->is_XMMRegister()) { 2063 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2064 } 2065 if (out_regs[c_arg].first()->is_Register()) { 2066 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2067 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2068 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2069 } 2070 #endif /* ASSERT */ 2071 switch (in_sig_bt[i]) { 2072 case T_ARRAY: 2073 case T_OBJECT: 2074 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2075 ((i == 0) && (!is_static)), 2076 &receiver_offset); 2077 break; 2078 case T_VOID: 2079 break; 2080 2081 case T_FLOAT: 2082 __ float_move(in_regs[i], out_regs[c_arg]); 2083 break; 2084 2085 case T_DOUBLE: 2086 assert( i + 1 < total_in_args && 2087 in_sig_bt[i + 1] == T_VOID && 2088 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2089 __ double_move(in_regs[i], out_regs[c_arg]); 2090 break; 2091 2092 case T_LONG : 2093 __ long_move(in_regs[i], out_regs[c_arg]); 2094 break; 2095 2096 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2097 2098 default: 2099 __ move32_64(in_regs[i], out_regs[c_arg]); 2100 } 2101 } 2102 2103 int c_arg; 2104 2105 // Pre-load a static method's oop into r14. Used both by locking code and 2106 // the normal JNI call code. 2107 // point c_arg at the first arg that is already loaded in case we 2108 // need to spill before we call out 2109 c_arg = total_c_args - total_in_args; 2110 2111 if (method->is_static()) { 2112 2113 // load oop into a register 2114 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2115 2116 // Now handlize the static class mirror it's known not-null. 2117 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2118 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2119 2120 // Now get the handle 2121 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2122 // store the klass handle as second argument 2123 __ movptr(c_rarg1, oop_handle_reg); 2124 // and protect the arg if we must spill 2125 c_arg--; 2126 } 2127 2128 // Change state to native (we save the return address in the thread, since it might not 2129 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2130 // points into the right code segment. It does not have to be the correct return pc. 2131 // We use the same pc/oopMap repeatedly when we call out 2132 2133 Label native_return; 2134 if (method->is_object_wait0()) { 2135 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2136 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2137 } else { 2138 intptr_t the_pc = (intptr_t) __ pc(); 2139 oop_maps->add_gc_map(the_pc - start, map); 2140 2141 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2142 } 2143 2144 // We have all of the arguments setup at this point. 
We must not touch any register 2145 // argument registers at this point (what if we save/restore them there are no oop? 2146 2147 if (DTraceMethodProbes) { 2148 // protect the args we've loaded 2149 save_args(masm, total_c_args, c_arg, out_regs); 2150 __ mov_metadata(c_rarg1, method()); 2151 __ call_VM_leaf( 2152 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2153 r15_thread, c_rarg1); 2154 restore_args(masm, total_c_args, c_arg, out_regs); 2155 } 2156 2157 // RedefineClasses() tracing support for obsolete method entry 2158 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2159 // protect the args we've loaded 2160 save_args(masm, total_c_args, c_arg, out_regs); 2161 __ mov_metadata(c_rarg1, method()); 2162 __ call_VM_leaf( 2163 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2164 r15_thread, c_rarg1); 2165 restore_args(masm, total_c_args, c_arg, out_regs); 2166 } 2167 2168 // Lock a synchronized method 2169 2170 // Register definitions used by locking and unlocking 2171 2172 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2173 const Register obj_reg = rbx; // Will contain the oop 2174 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2175 2176 Label slow_path_lock; 2177 Label lock_done; 2178 2179 if (method->is_synchronized()) { 2180 // Get the handle (the 2nd argument) 2181 __ mov(oop_handle_reg, c_rarg1); 2182 2183 // Get address of the box 2184 2185 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2186 2187 // Load the oop from the handle 2188 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2189 2190 __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock); 2191 2192 // Slow path will re-enter here 2193 __ bind(lock_done); 2194 } 2195 2196 // Finally just about ready to make the JNI call 2197 2198 // get JNIEnv* which is first argument to native 2199 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2200 2201 // Now set thread in native 2202 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2203 2204 __ call(RuntimeAddress(native_func)); 2205 2206 // Verify or restore cpu control state after JNI call 2207 __ restore_cpu_control_state_after_jni(rscratch1); 2208 2209 // Unpack native results. 2210 switch (ret_type) { 2211 case T_BOOLEAN: __ c2bool(rax); break; 2212 case T_CHAR : __ movzwl(rax, rax); break; 2213 case T_BYTE : __ sign_extend_byte (rax); break; 2214 case T_SHORT : __ sign_extend_short(rax); break; 2215 case T_INT : /* nothing to do */ break; 2216 case T_DOUBLE : 2217 case T_FLOAT : 2218 // Result is in xmm0 we'll save as needed 2219 break; 2220 case T_ARRAY: // Really a handle 2221 case T_OBJECT: // Really a handle 2222 break; // can't de-handlize until after safepoint check 2223 case T_VOID: break; 2224 case T_LONG: break; 2225 default : ShouldNotReachHere(); 2226 } 2227 2228 // Switch thread to "native transition" state before reading the synchronization state. 2229 // This additional state is necessary because reading and testing the synchronization 2230 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2231 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2232 // VM thread changes sync state to synchronizing and suspends threads for GC. 2233 // Thread A is resumed to finish this native method, but doesn't block here since it 2234 // didn't see any synchronization is progress, and escapes. 
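// The transition to _thread_in_native_trans below, together with the full
// fence that follows (or the system-wide barrier when UseSystemMemoryBarrier
// is enabled), ensures the safepoint/suspend poll further down observes any
// state the VM thread published before reading this thread's state, closing
// the window described above.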
2235 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2236 2237 // Force this write out before the read below 2238 if (!UseSystemMemoryBarrier) { 2239 __ membar(Assembler::Membar_mask_bits( 2240 Assembler::LoadLoad | Assembler::LoadStore | 2241 Assembler::StoreLoad | Assembler::StoreStore)); 2242 } 2243 2244 // check for safepoint operation in progress and/or pending suspend requests 2245 { 2246 Label Continue; 2247 Label slow_path; 2248 2249 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */); 2250 2251 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2252 __ jcc(Assembler::equal, Continue); 2253 __ bind(slow_path); 2254 2255 // Don't use call_VM as it will see a possible pending exception and forward it 2256 // and never return here preventing us from clearing _last_native_pc down below. 2257 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2258 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2259 // by hand. 2260 // 2261 __ vzeroupper(); 2262 save_native_result(masm, ret_type, stack_slots); 2263 __ mov(c_rarg0, r15_thread); 2264 __ mov(r12, rsp); // remember sp 2265 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2266 __ andptr(rsp, -16); // align stack as required by ABI 2267 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2268 __ mov(rsp, r12); // restore sp 2269 __ reinit_heapbase(); 2270 // Restore any method result value 2271 restore_native_result(masm, ret_type, stack_slots); 2272 __ bind(Continue); 2273 } 2274 2275 // change thread state 2276 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2277 2278 if (method->is_object_wait0()) { 2279 // Check preemption for Object.wait() 2280 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2281 __ cmpptr(rscratch1, NULL_WORD); 2282 __ jccb(Assembler::equal, native_return); 2283 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2284 __ jmp(rscratch1); 2285 __ bind(native_return); 2286 2287 intptr_t the_pc = (intptr_t) __ pc(); 2288 oop_maps->add_gc_map(the_pc - start, map); 2289 } 2290 2291 2292 Label reguard; 2293 Label reguard_done; 2294 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2295 __ jcc(Assembler::equal, reguard); 2296 __ bind(reguard_done); 2297 2298 // native result if any is live 2299 2300 // Unlock 2301 Label slow_path_unlock; 2302 Label unlock_done; 2303 if (method->is_synchronized()) { 2304 2305 Label fast_done; 2306 2307 // Get locked oop from the handle we passed to jni 2308 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2309 2310 // Must save rax if it is live now because cmpxchg must use it 2311 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2312 save_native_result(masm, ret_type, stack_slots); 2313 } 2314 2315 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2316 2317 // slow path re-enters here 2318 __ bind(unlock_done); 2319 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2320 restore_native_result(masm, ret_type, stack_slots); 2321 } 2322 2323 __ bind(fast_done); 2324 } 2325 if (DTraceMethodProbes) { 2326 save_native_result(masm, ret_type, stack_slots); 2327 __ mov_metadata(c_rarg1, method()); 2328 __ call_VM_leaf( 2329 CAST_FROM_FN_PTR(address, 
SharedRuntime::dtrace_method_exit), 2330 r15_thread, c_rarg1); 2331 restore_native_result(masm, ret_type, stack_slots); 2332 } 2333 2334 __ reset_last_Java_frame(false); 2335 2336 // Unbox oop result, e.g. JNIHandles::resolve value. 2337 if (is_reference_type(ret_type)) { 2338 __ resolve_jobject(rax /* value */, 2339 rcx /* tmp */); 2340 } 2341 2342 if (CheckJNICalls) { 2343 // clear_pending_jni_exception_check 2344 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2345 } 2346 2347 // reset handle block 2348 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2349 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2350 2351 // pop our frame 2352 2353 __ leave(); 2354 2355 #if INCLUDE_JFR 2356 // We need to do a poll test after unwind in case the sampler 2357 // managed to sample the native frame after returning to Java. 2358 Label L_return; 2359 address poll_test_pc = __ pc(); 2360 __ relocate(relocInfo::poll_return_type); 2361 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 2362 __ jccb(Assembler::zero, L_return); 2363 __ lea(rscratch1, InternalAddress(poll_test_pc)); 2364 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1); 2365 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr, 2366 "polling page return stub not created yet"); 2367 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); 2368 __ jump(RuntimeAddress(stub)); 2369 __ bind(L_return); 2370 #endif // INCLUDE_JFR 2371 2372 // Any exception pending? 2373 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2374 __ jcc(Assembler::notEqual, exception_pending); 2375 2376 // Return 2377 2378 __ ret(0); 2379 2380 // Unexpected paths are out of line and go here 2381 2382 // forward the exception 2383 __ bind(exception_pending); 2384 2385 // and forward the exception 2386 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2387 2388 // Slow path locking & unlocking 2389 if (method->is_synchronized()) { 2390 2391 // BEGIN Slow path lock 2392 __ bind(slow_path_lock); 2393 2394 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2395 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2396 2397 // protect the args we've loaded 2398 save_args(masm, total_c_args, c_arg, out_regs); 2399 2400 __ mov(c_rarg0, obj_reg); 2401 __ mov(c_rarg1, lock_reg); 2402 __ mov(c_rarg2, r15_thread); 2403 2404 // Not a leaf but we have last_Java_frame setup as we want. 2405 // We don't want to unmount in case of contention since that would complicate preserving 2406 // the arguments that had already been marshalled into the native convention. So we force 2407 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2408 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 
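// The push_cont_fastpath()/pop_cont_fastpath() pair below is what forces the
// freeze slow path for the duration of this blocking call, as described above.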
2409 __ push_cont_fastpath(); 2410 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2411 __ pop_cont_fastpath(); 2412 restore_args(masm, total_c_args, c_arg, out_regs); 2413 2414 #ifdef ASSERT 2415 { Label L; 2416 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2417 __ jcc(Assembler::equal, L); 2418 __ stop("no pending exception allowed on exit from monitorenter"); 2419 __ bind(L); 2420 } 2421 #endif 2422 __ jmp(lock_done); 2423 2424 // END Slow path lock 2425 2426 // BEGIN Slow path unlock 2427 __ bind(slow_path_unlock); 2428 2429 // If we haven't already saved the native result we must save it now as xmm registers 2430 // are still exposed. 2431 __ vzeroupper(); 2432 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2433 save_native_result(masm, ret_type, stack_slots); 2434 } 2435 2436 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2437 2438 __ mov(c_rarg0, obj_reg); 2439 __ mov(c_rarg2, r15_thread); 2440 __ mov(r12, rsp); // remember sp 2441 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2442 __ andptr(rsp, -16); // align stack as required by ABI 2443 2444 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2445 // NOTE that obj_reg == rbx currently 2446 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2447 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2448 2449 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2450 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2451 __ mov(rsp, r12); // restore sp 2452 __ reinit_heapbase(); 2453 #ifdef ASSERT 2454 { 2455 Label L; 2456 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2457 __ jcc(Assembler::equal, L); 2458 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2459 __ bind(L); 2460 } 2461 #endif /* ASSERT */ 2462 2463 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2464 2465 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2466 restore_native_result(masm, ret_type, stack_slots); 2467 } 2468 __ jmp(unlock_done); 2469 2470 // END Slow path unlock 2471 2472 } // synchronized 2473 2474 // SLOW PATH Reguard the stack if needed 2475 2476 __ bind(reguard); 2477 __ vzeroupper(); 2478 save_native_result(masm, ret_type, stack_slots); 2479 __ mov(r12, rsp); // remember sp 2480 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2481 __ andptr(rsp, -16); // align stack as required by ABI 2482 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2483 __ mov(rsp, r12); // restore sp 2484 __ reinit_heapbase(); 2485 restore_native_result(masm, ret_type, stack_slots); 2486 // and continue 2487 __ jmp(reguard_done); 2488 2489 2490 2491 __ flush(); 2492 2493 nmethod *nm = nmethod::new_native_nmethod(method, 2494 compile_id, 2495 masm->code(), 2496 vep_offset, 2497 frame_complete, 2498 stack_slots / VMRegImpl::slots_per_word, 2499 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2500 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2501 oop_maps); 2502 2503 return nm; 2504 } 2505 2506 // this function returns the adjust size (in number of words) to a c2i adapter 2507 // activation for use during deoptimization 2508 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2509 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2510 } 2511 2512 2513 uint SharedRuntime::out_preserve_stack_slots() { 2514 return 0; 2515 } 2516 2517 2518 // Number of stack slots between incoming argument block and the start of 2519 // a new frame. The PROLOG must add this many slots to the stack. The 2520 // EPILOG must remove this many slots. amd64 needs two slots for 2521 // return address. 2522 uint SharedRuntime::in_preserve_stack_slots() { 2523 return 4 + 2 * VerifyStackAtCalls; 2524 } 2525 2526 VMReg SharedRuntime::thread_register() { 2527 return r15_thread->as_VMReg(); 2528 } 2529 2530 //------------------------------generate_deopt_blob---------------------------- 2531 void SharedRuntime::generate_deopt_blob() { 2532 // Allocate space for the code 2533 ResourceMark rm; 2534 // Setup code generation tools 2535 int pad = 0; 2536 if (UseAVX > 2) { 2537 pad += 1024; 2538 } 2539 if (UseAPX) { 2540 pad += 1024; 2541 } 2542 #if INCLUDE_JVMCI 2543 if (EnableJVMCI) { 2544 pad += 512; // Increase the buffer size when compiling for JVMCI 2545 } 2546 #endif 2547 const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id); 2548 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id); 2549 if (blob != nullptr) { 2550 _deopt_blob = blob->as_deoptimization_blob(); 2551 return; 2552 } 2553 2554 CodeBuffer buffer(name, 2560+pad, 1024); 2555 MacroAssembler* masm = new MacroAssembler(&buffer); 2556 int frame_size_in_words; 2557 OopMap* map = nullptr; 2558 OopMapSet *oop_maps = new OopMapSet(); 2559 2560 // ------------- 2561 // This code enters when returning to a de-optimized nmethod. A return 2562 // address has been pushed on the stack, and return values are in 2563 // registers. 2564 // If we are doing a normal deopt then we were called from the patched 2565 // nmethod from the point we returned to the nmethod. So the return 2566 // address on the stack is wrong by NativeCall::instruction_size 2567 // We will adjust the value so it looks like we have the original return 2568 // address on the stack (like when we eagerly deoptimized). 2569 // In the case of an exception pending when deoptimizing, we enter 2570 // with a return address on the stack that points after the call we patched 2571 // into the exception handler. We have the following register state from, 2572 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2573 // rax: exception oop 2574 // rbx: exception handler 2575 // rdx: throwing pc 2576 // So in this case we simply jam rdx into the useless return address and 2577 // the stack looks just like we want. 2578 // 2579 // At this point we need to de-opt. We save the argument return 2580 // registers. We call the first C routine, fetch_unroll_info(). This 2581 // routine captures the return values and returns a structure which 2582 // describes the current frame size and the sizes of all replacement frames. 2583 // The current frame is compiled code and may contain many inlined 2584 // functions, each with their own JVM state. We pop the current frame, then 2585 // push all the new frames. 
Then we call the C routine unpack_frames() to 2586 // populate these frames. Finally unpack_frames() returns us the new target 2587 // address. Notice that callee-save registers are BLOWN here; they have 2588 // already been captured in the vframeArray at the time the return PC was 2589 // patched. 2590 address start = __ pc(); 2591 Label cont; 2592 2593 // Prolog for non exception case! 2594 2595 // Save everything in sight. 2596 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2597 2598 // Normal deoptimization. Save exec mode for unpack_frames. 2599 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2600 __ jmp(cont); 2601 2602 int reexecute_offset = __ pc() - start; 2603 #if INCLUDE_JVMCI && !defined(COMPILER1) 2604 if (UseJVMCICompiler) { 2605 // JVMCI does not use this kind of deoptimization 2606 __ should_not_reach_here(); 2607 } 2608 #endif 2609 2610 // Reexecute case 2611 // return address is the pc describes what bci to do re-execute at 2612 2613 // No need to update map as each call to save_live_registers will produce identical oopmap 2614 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2615 2616 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2617 __ jmp(cont); 2618 2619 #if INCLUDE_JVMCI 2620 Label after_fetch_unroll_info_call; 2621 int implicit_exception_uncommon_trap_offset = 0; 2622 int uncommon_trap_offset = 0; 2623 2624 if (EnableJVMCI) { 2625 implicit_exception_uncommon_trap_offset = __ pc() - start; 2626 2627 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2628 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2629 2630 uncommon_trap_offset = __ pc() - start; 2631 2632 // Save everything in sight. 2633 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2634 // fetch_unroll_info needs to call last_java_frame() 2635 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2636 2637 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2638 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2639 2640 __ movl(r14, Deoptimization::Unpack_reexecute); 2641 __ mov(c_rarg0, r15_thread); 2642 __ movl(c_rarg2, r14); // exec mode 2643 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2644 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2645 2646 __ reset_last_Java_frame(false); 2647 2648 __ jmp(after_fetch_unroll_info_call); 2649 } // EnableJVMCI 2650 #endif // INCLUDE_JVMCI 2651 2652 int exception_offset = __ pc() - start; 2653 2654 // Prolog for exception case 2655 2656 // all registers are dead at this entry point, except for rax, and 2657 // rdx which contain the exception oop and exception pc 2658 // respectively. Set them in TLS and fall thru to the 2659 // unpack_with_exception_in_tls entry point. 
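// In other words there are two exception entries: exception_offset expects the
// exception oop in rax and the throwing pc in rdx and stores them into the
// JavaThread here, while unpack_with_exception_in_tls assumes the caller has
// already stored them there.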
2660 2661 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 2662 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 2663 2664 int exception_in_tls_offset = __ pc() - start; 2665 2666 // new implementation because exception oop is now passed in JavaThread 2667 2668 // Prolog for exception case 2669 // All registers must be preserved because they might be used by LinearScan 2670 // Exceptiop oop and throwing PC are passed in JavaThread 2671 // tos: stack at point of call to method that threw the exception (i.e. only 2672 // args are on the stack, no return address) 2673 2674 // make room on stack for the return address 2675 // It will be patched later with the throwing pc. The correct value is not 2676 // available now because loading it from memory would destroy registers. 2677 __ push(0); 2678 2679 // Save everything in sight. 2680 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2681 2682 // Now it is safe to overwrite any register 2683 2684 // Deopt during an exception. Save exec mode for unpack_frames. 2685 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 2686 2687 // load throwing pc from JavaThread and patch it as the return address 2688 // of the current frame. Then clear the field in JavaThread 2689 2690 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2691 __ movptr(Address(rbp, wordSize), rdx); 2692 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2693 2694 #ifdef ASSERT 2695 // verify that there is really an exception oop in JavaThread 2696 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2697 __ verify_oop(rax); 2698 2699 // verify that there is no pending exception 2700 Label no_pending_exception; 2701 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 2702 __ testptr(rax, rax); 2703 __ jcc(Assembler::zero, no_pending_exception); 2704 __ stop("must not have pending exception here"); 2705 __ bind(no_pending_exception); 2706 #endif 2707 2708 __ bind(cont); 2709 2710 // Call C code. Need thread and this frame, but NOT official VM entry 2711 // crud. We cannot block on this call, no GC can happen. 2712 // 2713 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 2714 2715 // fetch_unroll_info needs to call last_java_frame(). 2716 2717 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2718 #ifdef ASSERT 2719 { Label L; 2720 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2721 __ jcc(Assembler::equal, L); 2722 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 2723 __ bind(L); 2724 } 2725 #endif // ASSERT 2726 __ mov(c_rarg0, r15_thread); 2727 __ movl(c_rarg1, r14); // exec_mode 2728 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 2729 2730 // Need to have an oopmap that tells fetch_unroll_info where to 2731 // find any register it might need. 2732 oop_maps->add_gc_map(__ pc() - start, map); 2733 2734 __ reset_last_Java_frame(false); 2735 2736 #if INCLUDE_JVMCI 2737 if (EnableJVMCI) { 2738 __ bind(after_fetch_unroll_info_call); 2739 } 2740 #endif 2741 2742 // Load UnrollBlock* into rdi 2743 __ mov(rdi, rax); 2744 2745 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 2746 Label noException; 2747 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 
2748 __ jcc(Assembler::notEqual, noException); 2749 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 2750 // QQQ this is useless it was null above 2751 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 2752 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 2753 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 2754 2755 __ verify_oop(rax); 2756 2757 // Overwrite the result registers with the exception results. 2758 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2759 // I think this is useless 2760 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx); 2761 2762 __ bind(noException); 2763 2764 // Only register save data is on the stack. 2765 // Now restore the result registers. Everything else is either dead 2766 // or captured in the vframeArray. 2767 RegisterSaver::restore_result_registers(masm); 2768 2769 // All of the register save area has been popped of the stack. Only the 2770 // return address remains. 2771 2772 // Pop all the frames we must move/replace. 2773 // 2774 // Frame picture (youngest to oldest) 2775 // 1: self-frame (no frame link) 2776 // 2: deopting frame (no frame link) 2777 // 3: caller of deopting frame (could be compiled/interpreted). 2778 // 2779 // Note: by leaving the return address of self-frame on the stack 2780 // and using the size of frame 2 to adjust the stack 2781 // when we are done the return to frame 3 will still be on the stack. 2782 2783 // Pop deoptimized frame 2784 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 2785 __ addptr(rsp, rcx); 2786 2787 // rsp should be pointing at the return address to the caller (3) 2788 2789 // Pick up the initial fp we should save 2790 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 2791 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 2792 2793 #ifdef ASSERT 2794 // Compilers generate code that bang the stack by as much as the 2795 // interpreter would need. So this stack banging should never 2796 // trigger a fault. Verify that it does not on non product builds. 2797 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 2798 __ bang_stack_size(rbx, rcx); 2799 #endif 2800 2801 // Load address of array of frame pcs into rcx 2802 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 2803 2804 // Trash the old pc 2805 __ addptr(rsp, wordSize); 2806 2807 // Load address of array of frame sizes into rsi 2808 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 2809 2810 // Load counter into rdx 2811 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 2812 2813 // Now adjust the caller's stack to make up for the extra locals 2814 // but record the original sp so that we can save it in the skeletal interpreter 2815 // frame and the stack walking of interpreter_sender will get the unextended sp 2816 // value and not the "real" sp value. 
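// The loop below builds one skeletal interpreter frame per entry in the
// frame_sizes/frame_pcs arrays: push the saved return pc, enter() to link rbp,
// carve out the frame body, and record sender_sp so the partially built frame
// is already walkable; layout_activation corrects the details later.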
2817 2818 const Register sender_sp = r8; 2819 2820 __ mov(sender_sp, rsp); 2821 __ movl(rbx, Address(rdi, 2822 Deoptimization::UnrollBlock:: 2823 caller_adjustment_offset())); 2824 __ subptr(rsp, rbx); 2825 2826 // Push interpreter frames in a loop 2827 Label loop; 2828 __ bind(loop); 2829 __ movptr(rbx, Address(rsi, 0)); // Load frame size 2830 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 2831 __ pushptr(Address(rcx, 0)); // Save return address 2832 __ enter(); // Save old & set new ebp 2833 __ subptr(rsp, rbx); // Prolog 2834 // This value is corrected by layout_activation_impl 2835 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 2836 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 2837 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 2838 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 2839 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 2840 __ decrementl(rdx); // Decrement counter 2841 __ jcc(Assembler::notZero, loop); 2842 __ pushptr(Address(rcx, 0)); // Save final return address 2843 2844 // Re-push self-frame 2845 __ enter(); // Save old & set new ebp 2846 2847 // Allocate a full sized register save area. 2848 // Return address and rbp are in place, so we allocate two less words. 2849 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 2850 2851 // Restore frame locals after moving the frame 2852 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 2853 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 2854 2855 // Call C code. Need thread but NOT official VM entry 2856 // crud. We cannot block on this call, no GC can happen. Call should 2857 // restore return values to their stack-slots with the new SP. 2858 // 2859 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 2860 2861 // Use rbp because the frames look interpreted now 2862 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 2863 // Don't need the precise return PC here, just precise enough to point into this code blob. 2864 address the_pc = __ pc(); 2865 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 2866 2867 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 2868 __ mov(c_rarg0, r15_thread); 2869 __ movl(c_rarg1, r14); // second arg: exec_mode 2870 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 2871 // Revert SP alignment after call since we're going to do some SP relative addressing below 2872 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 2873 2874 // Set an oopmap for the call site 2875 // Use the same PC we used for the last java frame 2876 oop_maps->add_gc_map(the_pc - start, 2877 new OopMap( frame_size_in_words, 0 )); 2878 2879 // Clear fp AND pc 2880 __ reset_last_Java_frame(true); 2881 2882 // Collect return values 2883 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 2884 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 2885 // I think this is useless (throwing pc?) 2886 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 2887 2888 // Pop self-frame. 
2889 __ leave(); // Epilog 2890 2891 // Jump to interpreter 2892 __ ret(0); 2893 2894 // Make sure all code is generated 2895 masm->flush(); 2896 2897 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 2898 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 2899 #if INCLUDE_JVMCI 2900 if (EnableJVMCI) { 2901 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 2902 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 2903 } 2904 #endif 2905 2906 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id); 2907 } 2908 2909 //------------------------------generate_handler_blob------ 2910 // 2911 // Generate a special Compile2Runtime blob that saves all registers, 2912 // and setup oopmap. 2913 // 2914 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) { 2915 assert(StubRoutines::forward_exception_entry() != nullptr, 2916 "must be generated before"); 2917 assert(is_polling_page_id(id), "expected a polling page stub id"); 2918 2919 // Allocate space for the code. Setup code generation tools. 2920 const char* name = SharedRuntime::stub_name(id); 2921 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 2922 if (blob != nullptr) { 2923 return blob->as_safepoint_blob(); 2924 } 2925 2926 ResourceMark rm; 2927 OopMapSet *oop_maps = new OopMapSet(); 2928 OopMap* map; 2929 CodeBuffer buffer(name, 2548, 1024); 2930 MacroAssembler* masm = new MacroAssembler(&buffer); 2931 2932 address start = __ pc(); 2933 address call_pc = nullptr; 2934 int frame_size_in_words; 2935 bool cause_return = (id == StubId::shared_polling_page_return_handler_id); 2936 bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id); 2937 2938 // Make room for return address (or push it again) 2939 if (!cause_return) { 2940 __ push(rbx); 2941 } 2942 2943 // Save registers, fpu state, and flags 2944 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 2945 2946 // The following is basically a call_VM. However, we need the precise 2947 // address of the call in order to generate an oopmap. Hence, we do all the 2948 // work ourselves. 2949 2950 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 2951 2952 // The return address must always be correct so that frame constructor never 2953 // sees an invalid pc. 2954 2955 if (!cause_return) { 2956 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 2957 // Additionally, rbx is a callee saved register and we can look at it later to determine 2958 // if someone changed the return address for us! 2959 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 2960 __ movptr(Address(rbp, wordSize), rbx); 2961 } 2962 2963 // Do the call 2964 __ mov(c_rarg0, r15_thread); 2965 __ call(RuntimeAddress(call_ptr)); 2966 2967 // Set an oopmap for the call site. This oopmap will map all 2968 // oop-registers and debug-info registers as callee-saved. This 2969 // will allow deoptimization at this safepoint to find all possible 2970 // debug-info recordings, as well as let GC find all oops. 
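// The offset recorded for this oop map must be the return address of the call
// just emitted, hence the __ pc() - start taken immediately after the call.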
2971 2972 oop_maps->add_gc_map( __ pc() - start, map); 2973 2974 Label noException; 2975 2976 __ reset_last_Java_frame(false); 2977 2978 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 2979 __ jcc(Assembler::equal, noException); 2980 2981 // Exception pending 2982 2983 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 2984 2985 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2986 2987 // No exception case 2988 __ bind(noException); 2989 2990 Label no_adjust; 2991 #ifdef ASSERT 2992 Label bail; 2993 #endif 2994 if (!cause_return) { 2995 Label no_prefix, not_special, check_rex_prefix; 2996 2997 // If our stashed return pc was modified by the runtime we avoid touching it 2998 __ cmpptr(rbx, Address(rbp, wordSize)); 2999 __ jcc(Assembler::notEqual, no_adjust); 3000 3001 // Skip over the poll instruction. 3002 // See NativeInstruction::is_safepoint_poll() 3003 // Possible encodings: 3004 // 85 00 test %eax,(%rax) 3005 // 85 01 test %eax,(%rcx) 3006 // 85 02 test %eax,(%rdx) 3007 // 85 03 test %eax,(%rbx) 3008 // 85 06 test %eax,(%rsi) 3009 // 85 07 test %eax,(%rdi) 3010 // 3011 // 41 85 00 test %eax,(%r8) 3012 // 41 85 01 test %eax,(%r9) 3013 // 41 85 02 test %eax,(%r10) 3014 // 41 85 03 test %eax,(%r11) 3015 // 41 85 06 test %eax,(%r14) 3016 // 41 85 07 test %eax,(%r15) 3017 // 3018 // 85 04 24 test %eax,(%rsp) 3019 // 41 85 04 24 test %eax,(%r12) 3020 // 85 45 00 test %eax,0x0(%rbp) 3021 // 41 85 45 00 test %eax,0x0(%r13) 3022 // 3023 // Notes: 3024 // Format of legacy MAP0 test instruction:- 3025 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32] 3026 // o For safepoint polling instruction "test %eax,(%rax)", encoding of first register 3027 // operand and base register of memory operand is b/w [0-8), hence we do not require 3028 // additional REX prefix where REX.B bit stores MSB bit of register encoding, which 3029 // is why two bytes encoding is sufficient here. 3030 // o For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE 3031 // register of memory operand is 1000, thus we need additional REX prefix in this case, 3032 // there by adding additional byte to instruction encoding. 3033 // o In case BASE register is one of the 32 extended GPR registers available only on targets 3034 // supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold 3035 // most significant two bits of 5 bit register encoding. 3036 3037 if (VM_Version::supports_apx_f()) { 3038 __ cmpb(Address(rbx, 0), Assembler::REX2); 3039 __ jccb(Assembler::notEqual, check_rex_prefix); 3040 __ addptr(rbx, 2); 3041 __ bind(check_rex_prefix); 3042 } 3043 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix); 3044 __ jccb(Assembler::notEqual, no_prefix); 3045 __ addptr(rbx, 1); 3046 __ bind(no_prefix); 3047 #ifdef ASSERT 3048 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below 3049 #endif 3050 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values: 3051 // r12/rsp 0x04 3052 // r13/rbp 0x05 3053 __ movzbq(rcx, Address(rbx, 1)); 3054 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05 3055 __ subptr(rcx, 4); // looking for 0x00 .. 0x01 3056 __ cmpptr(rcx, 1); 3057 __ jccb(Assembler::above, not_special); 3058 __ addptr(rbx, 1); 3059 __ bind(not_special); 3060 #ifdef ASSERT 3061 // Verify the correct encoding of the poll we're about to skip. 
3062     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3063     __ jcc(Assembler::notEqual, bail);
3064     // Mask out the modrm bits
3065     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3066     // rax encodes to 0, so if the bits are nonzero it's incorrect
3067     __ jcc(Assembler::notZero, bail);
3068 #endif
3069     // Adjust return pc forward to step over the safepoint poll instruction
3070     __ addptr(rbx, 2);
3071     __ movptr(Address(rbp, wordSize), rbx);
3072   }
3073
3074   __ bind(no_adjust);
3075   // Normal exit, restore registers and exit.
3076   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3077   __ ret(0);
3078
3079 #ifdef ASSERT
3080   __ bind(bail);
3081   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3082 #endif
3083
3084   // Make sure all code is generated
3085   masm->flush();
3086
3087   // Fill-out other meta info
3088   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3089
3090   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3091   return sp_blob;
3092 }
3093
3094 //
3095 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3096 //
3097 // Generate a stub that calls into the vm to find out the proper destination
3098 // of a Java call. All the argument registers are live at this point
3099 // but since this is generic code we don't know what they are and the caller
3100 // must do any GC of the args.
3101 //
3102 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3103   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3104   assert(is_resolve_id(id), "expected a resolve stub id");
3105
3106   const char* name = SharedRuntime::stub_name(id);
3107   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3108   if (blob != nullptr) {
3109     return blob->as_runtime_stub();
3110   }
3111
3112   // allocate space for the code
3113   ResourceMark rm;
3114   CodeBuffer buffer(name, 1552, 512);
3115   MacroAssembler* masm = new MacroAssembler(&buffer);
3116
3117   int frame_size_in_words;
3118
3119   OopMapSet *oop_maps = new OopMapSet();
3120   OopMap* map = nullptr;
3121
3122   int start = __ offset();
3123
3124   // No need to save vector registers since they are caller-saved anyway.
3125   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3126
3127   int frame_complete = __ offset();
3128
3129   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3130
3131   __ mov(c_rarg0, r15_thread);
3132
3133   __ call(RuntimeAddress(destination));
3134
3135
3136   // Set an oopmap for the call site.
3137   // We need this not only for callee-saved registers, but also for volatile
3138   // registers that the compiler might be keeping live across a safepoint.
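  // In outline, after the call returns: the resolved code entry is in rax and the
  // resolved Method* is available through the thread-local vm_result_metadata.  Both
  // are written back into the register-save area below so that
  // restore_live_registers() materializes them in rax/rbx before we jump.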
3139
3140   oop_maps->add_gc_map( __ offset() - start, map);
3141
3142   // rax contains the address we are going to jump to, assuming no exception was installed
3143
3144   // clear last_Java_sp
3145   __ reset_last_Java_frame(false);
3146   // check for pending exceptions
3147   Label pending;
3148   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3149   __ jcc(Assembler::notEqual, pending);
3150
3151   // get the returned Method*
3152   __ get_vm_result_metadata(rbx);
3153   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3154
3155   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3156
3157   RegisterSaver::restore_live_registers(masm);
3158
3159   // We are back to the original state on entry and ready to go.
3160
3161   __ jmp(rax);
3162
3163   // Pending exception after the safepoint
3164
3165   __ bind(pending);
3166
3167   RegisterSaver::restore_live_registers(masm);
3168
3169   // exception pending => remove activation and forward to exception handler
3170
3171   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3172
3173   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3174   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3175
3176   // -------------
3177   // make sure all code is generated
3178   masm->flush();
3179
3180   // return the blob
3181   // note: the frame size is passed to the RuntimeStub in words, not bytes
3182   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3183
3184   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3185   return rs_blob;
3186 }
3187
3188 // Continuation point for throwing of implicit exceptions that are
3189 // not handled in the current activation. Fabricates an exception
3190 // oop and initiates normal exception dispatching in this
3191 // frame. Since we need to preserve callee-saved values (currently
3192 // only for C2, but done for C1 as well) we need a callee-saved oop
3193 // map and therefore have to make these stubs into RuntimeStubs
3194 // rather than BufferBlobs.  If the compiler needs all registers to
3195 // be preserved between the fault point and the exception handler
3196 // then it must assume responsibility for that in
3197 // AbstractCompiler::continuation_for_implicit_null_exception or
3198 // continuation_for_implicit_division_by_zero_exception.  All other
3199 // implicit exceptions (e.g., NullPointerException or
3200 // AbstractMethodError on entry) are either at call sites or
3201 // otherwise assume that stack unwinding will be initiated, so
3202 // caller-saved registers were assumed volatile in the compiler.
3203 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3204   assert(is_throw_id(id), "expected a throw stub id");
3205
3206   const char* name = SharedRuntime::stub_name(id);
3207
3208   // Information about frame layout at time of blocking runtime call.
3209   // Note that we only have to preserve callee-saved registers since
3210   // the compilers are responsible for supplying a continuation point
3211   // if they expect all registers to be preserved.
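  // Rough picture of the frame described by the enum below, in 32-bit slots from
  // low to high addresses (arg_reg_save_area_bytes is the register-argument shadow
  // area required by some ABIs, e.g. Win64, and may be zero elsewhere):
  //
  //   [ arg register save area ][ saved rbp : 2 slots ][ return address : 2 slots ]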
3212 enum layout { 3213 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3214 rbp_off2, 3215 return_off, 3216 return_off2, 3217 framesize // inclusive of return address 3218 }; 3219 3220 int insts_size = 512; 3221 int locs_size = 64; 3222 3223 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3224 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3225 3226 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3227 if (blob != nullptr) { 3228 return blob->as_runtime_stub(); 3229 } 3230 3231 ResourceMark rm; 3232 CodeBuffer code(name, insts_size, locs_size); 3233 OopMapSet* oop_maps = new OopMapSet(); 3234 MacroAssembler* masm = new MacroAssembler(&code); 3235 3236 address start = __ pc(); 3237 3238 // This is an inlined and slightly modified version of call_VM 3239 // which has the ability to fetch the return PC out of 3240 // thread-local storage and also sets up last_Java_sp slightly 3241 // differently than the real call_VM 3242 3243 __ enter(); // required for proper stackwalking of RuntimeStub frame 3244 3245 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3246 3247 // return address and rbp are already in place 3248 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3249 3250 int frame_complete = __ pc() - start; 3251 3252 // Set up last_Java_sp and last_Java_fp 3253 address the_pc = __ pc(); 3254 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3255 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3256 3257 // Call runtime 3258 __ movptr(c_rarg0, r15_thread); 3259 BLOCK_COMMENT("call runtime_entry"); 3260 __ call(RuntimeAddress(runtime_entry)); 3261 3262 // Generate oop map 3263 OopMap* map = new OopMap(framesize, 0); 3264 3265 oop_maps->add_gc_map(the_pc - start, map); 3266 3267 __ reset_last_Java_frame(true); 3268 3269 __ leave(); // required for proper stackwalking of RuntimeStub frame 3270 3271 // check for pending exceptions 3272 #ifdef ASSERT 3273 Label L; 3274 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3275 __ jcc(Assembler::notEqual, L); 3276 __ should_not_reach_here(); 3277 __ bind(L); 3278 #endif // ASSERT 3279 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3280 3281 3282 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3283 RuntimeStub* stub = 3284 RuntimeStub::new_runtime_stub(name, 3285 &code, 3286 frame_complete, 3287 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3288 oop_maps, false); 3289 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id)); 3290 3291 return stub; 3292 } 3293 3294 //------------------------------Montgomery multiplication------------------------ 3295 // 3296 3297 #ifndef _WINDOWS 3298 3299 // Subtract 0:b from carry:a. Return carry. 3300 static julong 3301 sub(julong a[], julong b[], julong carry, long len) { 3302 long long i = 0, cnt = len; 3303 julong tmp; 3304 asm volatile("clc; " 3305 "0: ; " 3306 "mov (%[b], %[i], 8), %[tmp]; " 3307 "sbb %[tmp], (%[a], %[i], 8); " 3308 "inc %[i]; dec %[cnt]; " 3309 "jne 0b; " 3310 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3311 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3312 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3313 : "memory"); 3314 return tmp; 3315 } 3316 3317 // Multiply (unsigned) Long A by Long B, accumulating the double- 3318 // length result into the accumulator formed of T0, T1, and T2. 
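// Illustration of what the macros compute, viewing (T2:T1:T0) as one 192-bit
// accumulator and A * B as a full 128-bit product:
//   MACC (A, B, T0, T1, T2):   (T2:T1:T0) +=     A * B
//   MACC2(A, B, T0, T1, T2):   (T2:T1:T0) += 2 * A * B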
3319 #define MACC(A, B, T0, T1, T2) \ 3320 do { \ 3321 unsigned long hi, lo; \ 3322 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3323 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3324 : "r"(A), "a"(B) : "cc"); \ 3325 } while(0) 3326 3327 // As above, but add twice the double-length result into the 3328 // accumulator. 3329 #define MACC2(A, B, T0, T1, T2) \ 3330 do { \ 3331 unsigned long hi, lo; \ 3332 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3333 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3334 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3335 : "r"(A), "a"(B) : "cc"); \ 3336 } while(0) 3337 3338 #else //_WINDOWS 3339 3340 static julong 3341 sub(julong a[], julong b[], julong carry, long len) { 3342 long i; 3343 julong tmp; 3344 unsigned char c = 1; 3345 for (i = 0; i < len; i++) { 3346 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3347 a[i] = tmp; 3348 } 3349 c = _addcarry_u64(c, carry, ~0, &tmp); 3350 return tmp; 3351 } 3352 3353 // Multiply (unsigned) Long A by Long B, accumulating the double- 3354 // length result into the accumulator formed of T0, T1, and T2. 3355 #define MACC(A, B, T0, T1, T2) \ 3356 do { \ 3357 julong hi, lo; \ 3358 lo = _umul128(A, B, &hi); \ 3359 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3360 c = _addcarry_u64(c, hi, T1, &T1); \ 3361 _addcarry_u64(c, T2, 0, &T2); \ 3362 } while(0) 3363 3364 // As above, but add twice the double-length result into the 3365 // accumulator. 3366 #define MACC2(A, B, T0, T1, T2) \ 3367 do { \ 3368 julong hi, lo; \ 3369 lo = _umul128(A, B, &hi); \ 3370 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3371 c = _addcarry_u64(c, hi, T1, &T1); \ 3372 _addcarry_u64(c, T2, 0, &T2); \ 3373 c = _addcarry_u64(0, lo, T0, &T0); \ 3374 c = _addcarry_u64(c, hi, T1, &T1); \ 3375 _addcarry_u64(c, T2, 0, &T2); \ 3376 } while(0) 3377 3378 #endif //_WINDOWS 3379 3380 // Fast Montgomery multiplication. The derivation of the algorithm is 3381 // in A Cryptographic Library for the Motorola DSP56000, 3382 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3383 3384 static void NOINLINE 3385 montgomery_multiply(julong a[], julong b[], julong n[], 3386 julong m[], julong inv, int len) { 3387 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3388 int i; 3389 3390 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3391 3392 for (i = 0; i < len; i++) { 3393 int j; 3394 for (j = 0; j < i; j++) { 3395 MACC(a[j], b[i-j], t0, t1, t2); 3396 MACC(m[j], n[i-j], t0, t1, t2); 3397 } 3398 MACC(a[i], b[0], t0, t1, t2); 3399 m[i] = t0 * inv; 3400 MACC(m[i], n[0], t0, t1, t2); 3401 3402 assert(t0 == 0, "broken Montgomery multiply"); 3403 3404 t0 = t1; t1 = t2; t2 = 0; 3405 } 3406 3407 for (i = len; i < 2*len; i++) { 3408 int j; 3409 for (j = i-len+1; j < len; j++) { 3410 MACC(a[j], b[i-j], t0, t1, t2); 3411 MACC(m[j], n[i-j], t0, t1, t2); 3412 } 3413 m[i-len] = t0; 3414 t0 = t1; t1 = t2; t2 = 0; 3415 } 3416 3417 while (t0) 3418 t0 = sub(m, n, t0, len); 3419 } 3420 3421 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3422 // multiplies so it should be up to 25% faster than Montgomery 3423 // multiplication. However, its loop control is more complex and it 3424 // may actually run slower on some machines. 
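// For reference, the precondition checked by the asserts in these routines, scaled
// down to 8-bit words for readability (the numbers are only an illustration): with
// n[0] = 0xE1, n[0]^-1 mod 2^8 = 0x21, so inv = -0x21 mod 2^8 = 0xDF and
// inv * n[0] = 0xDF * 0xE1 = 0xC3FF, whose low byte is 0xFF (all ones), which is
// the 8-bit analogue of the 64-bit check inv * n[0] == ULLONG_MAX.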
3425
3426 static void NOINLINE
3427 montgomery_square(julong a[], julong n[],
3428                   julong m[], julong inv, int len) {
3429   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3430   int i;
3431
3432   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3433
3434   for (i = 0; i < len; i++) {
3435     int j;
3436     int end = (i+1)/2;
3437     for (j = 0; j < end; j++) {
3438       MACC2(a[j], a[i-j], t0, t1, t2);
3439       MACC(m[j], n[i-j], t0, t1, t2);
3440     }
3441     if ((i & 1) == 0) {
3442       MACC(a[j], a[j], t0, t1, t2);
3443     }
3444     for (; j < i; j++) {
3445       MACC(m[j], n[i-j], t0, t1, t2);
3446     }
3447     m[i] = t0 * inv;
3448     MACC(m[i], n[0], t0, t1, t2);
3449
3450     assert(t0 == 0, "broken Montgomery square");
3451
3452     t0 = t1; t1 = t2; t2 = 0;
3453   }
3454
3455   for (i = len; i < 2*len; i++) {
3456     int start = i-len+1;
3457     int end = start + (len - start)/2;
3458     int j;
3459     for (j = start; j < end; j++) {
3460       MACC2(a[j], a[i-j], t0, t1, t2);
3461       MACC(m[j], n[i-j], t0, t1, t2);
3462     }
3463     if ((i & 1) == 0) {
3464       MACC(a[j], a[j], t0, t1, t2);
3465     }
3466     for (; j < len; j++) {
3467       MACC(m[j], n[i-j], t0, t1, t2);
3468     }
3469     m[i-len] = t0;
3470     t0 = t1; t1 = t2; t2 = 0;
3471   }
3472
3473   while (t0)
3474     t0 = sub(m, n, t0, len);
3475 }
3476
3477 // Swap words in a longword.
3478 static julong swap(julong x) {
3479   return (x << 32) | (x >> 32);
3480 }
3481
3482 // Copy len longwords from s to d, word-swapping as we go.  The
3483 // destination array is reversed.
3484 static void reverse_words(julong *s, julong *d, int len) {
3485   d += len;
3486   while(len-- > 0) {
3487     d--;
3488     *d = swap(*s);
3489     s++;
3490   }
3491 }
3492
3493 // The threshold at which squaring is advantageous was determined
3494 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3495 #define MONTGOMERY_SQUARING_THRESHOLD 64
3496
3497 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3498                                         jint len, jlong inv,
3499                                         jint *m_ints) {
3500   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3501   int longwords = len/2;
3502
3503   // Make very sure we don't use so much space that the stack might
3504   // overflow.  512 jints corresponds to a 16384-bit integer and
3505   // will use here a total of 8k bytes of stack space.
3506   int divisor = sizeof(julong) * 4;
3507   guarantee(longwords <= 8192 / divisor, "must be");
3508   int total_allocation = longwords * sizeof (julong) * 4;
3509   julong *scratch = (julong *)alloca(total_allocation);
3510
3511   // Local scratch arrays
3512   julong
3513     *a = scratch + 0 * longwords,
3514     *b = scratch + 1 * longwords,
3515     *n = scratch + 2 * longwords,
3516     *m = scratch + 3 * longwords;
3517
3518   reverse_words((julong *)a_ints, a, longwords);
3519   reverse_words((julong *)b_ints, b, longwords);
3520   reverse_words((julong *)n_ints, n, longwords);
3521
3522   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3523
3524   reverse_words(m, (julong *)m_ints, longwords);
3525 }
3526
3527 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3528                                       jint len, jlong inv,
3529                                       jint *m_ints) {
3530   assert(len % 2 == 0, "array length in montgomery_square must be even");
3531   int longwords = len/2;
3532
3533   // Make very sure we don't use so much space that the stack might
3534   // overflow.  512 jints corresponds to a 16384-bit integer and
3535   // will use here a total of 6k bytes of stack space.
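  // For example (illustration only): with an input of 512 jints, longwords == 256
  // and the three scratch arrays below occupy 256 * sizeof(julong) * 3 == 6144
  // bytes, comfortably within the 8K bound enforced by the guarantee that follows.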
3536 int divisor = sizeof(julong) * 3; 3537 guarantee(longwords <= (8192 / divisor), "must be"); 3538 int total_allocation = longwords * sizeof (julong) * 3; 3539 julong *scratch = (julong *)alloca(total_allocation); 3540 3541 // Local scratch arrays 3542 julong 3543 *a = scratch + 0 * longwords, 3544 *n = scratch + 1 * longwords, 3545 *m = scratch + 2 * longwords; 3546 3547 reverse_words((julong *)a_ints, a, longwords); 3548 reverse_words((julong *)n_ints, n, longwords); 3549 3550 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3551 ::montgomery_square(a, n, m, (julong)inv, longwords); 3552 } else { 3553 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3554 } 3555 3556 reverse_words(m, (julong *)m_ints, longwords); 3557 } 3558 3559 #if INCLUDE_JFR 3560 3561 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 3562 // It returns a jobject handle to the event writer. 3563 // The handle is dereferenced and the return value is the event writer oop. 3564 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 3565 enum layout { 3566 rbp_off, 3567 rbpH_off, 3568 return_off, 3569 return_off2, 3570 framesize // inclusive of return address 3571 }; 3572 3573 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id); 3574 CodeBuffer code(name, 1024, 64); 3575 MacroAssembler* masm = new MacroAssembler(&code); 3576 address start = __ pc(); 3577 3578 __ enter(); 3579 address the_pc = __ pc(); 3580 3581 int frame_complete = the_pc - start; 3582 3583 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3584 __ movptr(c_rarg0, r15_thread); 3585 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 3586 __ reset_last_Java_frame(true); 3587 3588 // rax is jobject handle result, unpack and process it through a barrier. 3589 __ resolve_global_jobject(rax, c_rarg0); 3590 3591 __ leave(); 3592 __ ret(0); 3593 3594 OopMapSet* oop_maps = new OopMapSet(); 3595 OopMap* map = new OopMap(framesize, 1); 3596 oop_maps->add_gc_map(frame_complete, map); 3597 3598 RuntimeStub* stub = 3599 RuntimeStub::new_runtime_stub(name, 3600 &code, 3601 frame_complete, 3602 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3603 oop_maps, 3604 false); 3605 return stub; 3606 } 3607 3608 // For c2: call to return a leased buffer. 3609 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 3610 enum layout { 3611 rbp_off, 3612 rbpH_off, 3613 return_off, 3614 return_off2, 3615 framesize // inclusive of return address 3616 }; 3617 3618 const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id); 3619 CodeBuffer code(name, 1024, 64); 3620 MacroAssembler* masm = new MacroAssembler(&code); 3621 address start = __ pc(); 3622 3623 __ enter(); 3624 address the_pc = __ pc(); 3625 3626 int frame_complete = the_pc - start; 3627 3628 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 3629 __ movptr(c_rarg0, r15_thread); 3630 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 3631 __ reset_last_Java_frame(true); 3632 3633 __ leave(); 3634 __ ret(0); 3635 3636 OopMapSet* oop_maps = new OopMapSet(); 3637 OopMap* map = new OopMap(framesize, 1); 3638 oop_maps->add_gc_map(frame_complete, map); 3639 3640 RuntimeStub* stub = 3641 RuntimeStub::new_runtime_stub(name, 3642 &code, 3643 frame_complete, 3644 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3645 oop_maps, 3646 false); 3647 return stub; 3648 } 3649 3650 #endif // INCLUDE_JFR 3651