1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef _WINDOWS 26 #include "alloca.h" 27 #endif 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "classfile/symbolTable.hpp" 31 #include "code/aotCodeCache.hpp" 32 #include "code/compiledIC.hpp" 33 #include "code/debugInfoRec.hpp" 34 #include "code/nativeInst.hpp" 35 #include "code/vtableStubs.hpp" 36 #include "compiler/oopMap.hpp" 37 #include "gc/shared/collectedHeap.hpp" 38 #include "gc/shared/gcLocker.hpp" 39 #include "gc/shared/barrierSet.hpp" 40 #include "gc/shared/barrierSetAssembler.hpp" 41 #include "interpreter/interpreter.hpp" 42 #include "logging/log.hpp" 43 #include "memory/resourceArea.hpp" 44 #include "memory/universe.hpp" 45 #include "oops/klass.inline.hpp" 46 #include "oops/method.inline.hpp" 47 #include "prims/methodHandles.hpp" 48 #include "runtime/continuation.hpp" 49 #include "runtime/continuationEntry.inline.hpp" 50 #include "runtime/globals.hpp" 51 #include "runtime/jniHandles.hpp" 52 #include "runtime/safepointMechanism.hpp" 53 #include "runtime/sharedRuntime.hpp" 54 #include "runtime/signature.hpp" 55 #include "runtime/stubRoutines.hpp" 56 #include "runtime/timerTrace.hpp" 57 #include "runtime/vframeArray.hpp" 58 #include "runtime/vm_version.hpp" 59 #include "utilities/align.hpp" 60 #include "utilities/checkedCast.hpp" 61 #include "utilities/formatBuffer.hpp" 62 #include "vmreg_x86.inline.hpp" 63 #ifdef COMPILER1 64 #include "c1/c1_Runtime1.hpp" 65 #endif 66 #ifdef COMPILER2 67 #include "opto/runtime.hpp" 68 #endif 69 #if INCLUDE_JVMCI 70 #include "jvmci/jvmciJavaClasses.hpp" 71 #endif 72 73 #define __ masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif // PRODUCT 80 81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 82 83 class RegisterSaver { 84 // Capture info about frame layout. Layout offsets are in jint 85 // units because compiler frame slots are jints. 
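  // Every 64-bit quantity therefore occupies two consecutive 4-byte slots,
  // which is why each register below is given a reg_off/regH_off pair.
  // The XSAVE_AREA_* constants are byte offsets into the fxsave/xsave image
  // and are converted to slots by dividing by BytesPerInt; for example, the
  // xmm registers start at byte 160 of that image, so xmm0_off works out to
  // fpu_state_off + 160/4 = fpu_state_off + 40 slots.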
86 #define XSAVE_AREA_BEGIN 160 87 #define XSAVE_AREA_YMM_BEGIN 576 88 #define XSAVE_AREA_EGPRS 960 89 #define XSAVE_AREA_OPMASK_BEGIN 1088 90 #define XSAVE_AREA_ZMM_BEGIN 1152 91 #define XSAVE_AREA_UPPERBANK 1664 92 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 93 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 94 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 95 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 97 enum layout { 98 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 99 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 100 DEF_XMM_OFFS(0), 101 DEF_XMM_OFFS(1), 102 // 2..15 are implied in range usage 103 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 104 DEF_YMM_OFFS(0), 105 DEF_YMM_OFFS(1), 106 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 107 r16H_off, 108 r17_off, r17H_off, 109 r18_off, r18H_off, 110 r19_off, r19H_off, 111 r20_off, r20H_off, 112 r21_off, r21H_off, 113 r22_off, r22H_off, 114 r23_off, r23H_off, 115 r24_off, r24H_off, 116 r25_off, r25H_off, 117 r26_off, r26H_off, 118 r27_off, r27H_off, 119 r28_off, r28H_off, 120 r29_off, r29H_off, 121 r30_off, r30H_off, 122 r31_off, r31H_off, 123 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 124 DEF_OPMASK_OFFS(0), 125 DEF_OPMASK_OFFS(1), 126 // 2..7 are implied in range usage 127 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 128 DEF_ZMM_OFFS(0), 129 DEF_ZMM_OFFS(1), 130 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 131 DEF_ZMM_UPPER_OFFS(16), 132 DEF_ZMM_UPPER_OFFS(17), 133 // 18..31 are implied in range usage 134 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 135 fpu_stateH_end, 136 r15_off, r15H_off, 137 r14_off, r14H_off, 138 r13_off, r13H_off, 139 r12_off, r12H_off, 140 r11_off, r11H_off, 141 r10_off, r10H_off, 142 r9_off, r9H_off, 143 r8_off, r8H_off, 144 rdi_off, rdiH_off, 145 rsi_off, rsiH_off, 146 ignore_off, ignoreH_off, // extra copy of rbp 147 rsp_off, rspH_off, 148 rbx_off, rbxH_off, 149 rdx_off, rdxH_off, 150 rcx_off, rcxH_off, 151 rax_off, raxH_off, 152 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 153 align_off, alignH_off, 154 flags_off, flagsH_off, 155 // The frame sender code expects that rbp will be in the "natural" place and 156 // will override any oopMap setting for it. We must therefore force the layout 157 // so that it agrees with the frame sender code. 
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector frames are allocated this way.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
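  // The pushes below build the frame to match the layout enum above, from
  // higher to lower addresses: return address (already pushed by the caller),
  // saved rbp (enter), saved flags (pushf), an 8-byte alignment filler, the
  // legacy general purpose registers (save_legacy_gprs, filling rax_off down
  // to r15_off), and finally the FPU/XSAVE image (push_FPU_state).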
208 209 __ enter(); // rsp becomes 16-byte aligned here 210 __ pushf(); 211 // Make sure rsp stays 16-byte aligned 212 __ subq(rsp, 8); 213 // Push CPU state in multiple of 16 bytes 214 __ save_legacy_gprs(); 215 __ push_FPU_state(); 216 217 218 // push cpu state handles this on EVEX enabled targets 219 if (save_wide_vectors) { 220 // Save upper half of YMM registers(0..15) 221 int base_addr = XSAVE_AREA_YMM_BEGIN; 222 for (int n = 0; n < 16; n++) { 223 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 224 } 225 if (VM_Version::supports_evex()) { 226 // Save upper half of ZMM registers(0..15) 227 base_addr = XSAVE_AREA_ZMM_BEGIN; 228 for (int n = 0; n < 16; n++) { 229 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 230 } 231 // Save full ZMM registers(16..num_xmm_regs) 232 base_addr = XSAVE_AREA_UPPERBANK; 233 off = 0; 234 int vector_len = Assembler::AVX_512bit; 235 for (int n = 16; n < num_xmm_regs; n++) { 236 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 237 } 238 #if COMPILER2_OR_JVMCI 239 base_addr = XSAVE_AREA_OPMASK_BEGIN; 240 off = 0; 241 for(int n = 0; n < KRegister::number_of_registers; n++) { 242 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 243 } 244 #endif 245 } 246 } else { 247 if (VM_Version::supports_evex()) { 248 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 249 int base_addr = XSAVE_AREA_UPPERBANK; 250 off = 0; 251 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 252 for (int n = 16; n < num_xmm_regs; n++) { 253 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 254 } 255 #if COMPILER2_OR_JVMCI 256 base_addr = XSAVE_AREA_OPMASK_BEGIN; 257 off = 0; 258 for(int n = 0; n < KRegister::number_of_registers; n++) { 259 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 260 } 261 #endif 262 } 263 } 264 265 #if COMPILER2_OR_JVMCI 266 if (UseAPX) { 267 int base_addr = XSAVE_AREA_EGPRS; 268 off = 0; 269 for (int n = 16; n < Register::number_of_registers; n++) { 270 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 271 } 272 } 273 #endif 274 275 __ vzeroupper(); 276 if (frame::arg_reg_save_area_bytes != 0) { 277 // Allocate argument register save area 278 __ subptr(rsp, frame::arg_reg_save_area_bytes); 279 } 280 281 // Set an oopmap for the call site. This oopmap will map all 282 // oop-registers and debug-info registers as callee-saved. This 283 // will allow deoptimization at this safepoint to find all possible 284 // debug-info recordings, as well as let GC find all oops. 
285 286 OopMapSet *oop_maps = new OopMapSet(); 287 OopMap* map = new OopMap(frame_size_in_slots, 0); 288 289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 290 291 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 294 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 295 // rbp location is known implicitly by the frame sender code, needs no oopmap 296 // and the location where rbp was saved by is ignored 297 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 306 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 307 308 if (UseAPX) { 309 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 324 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 325 } 326 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 327 // on EVEX enabled targets, we get it included in the xsave area 328 off = xmm0_off; 329 int delta = xmm1_off - off; 330 for (int n = 0; n < 16; n++) { 331 XMMRegister xmm_name = as_XMMRegister(n); 332 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 333 off += delta; 334 } 335 if (UseAVX > 2) { 336 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 337 off = zmm16_off; 338 delta = zmm17_off - off; 339 for (int n = 16; n < num_xmm_regs; n++) { 340 XMMRegister zmm_name = as_XMMRegister(n); 341 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 342 off += delta; 343 } 344 } 345 346 #if COMPILER2_OR_JVMCI 347 if (save_wide_vectors) { 348 // Save upper half of YMM registers(0..15) 349 off = ymm0_off; 350 delta = ymm1_off - ymm0_off; 351 for (int n = 0; n < 16; n++) { 352 XMMRegister ymm_name = as_XMMRegister(n); 353 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 354 off += delta; 355 } 356 if (VM_Version::supports_evex()) { 357 // Save upper half of ZMM registers(0..15) 358 off = zmm0_off; 
359 delta = zmm1_off - zmm0_off; 360 for (int n = 0; n < 16; n++) { 361 XMMRegister zmm_name = as_XMMRegister(n); 362 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 363 off += delta; 364 } 365 } 366 } 367 #endif // COMPILER2_OR_JVMCI 368 369 // %%% These should all be a waste but we'll keep things as they were for now 370 if (true) { 371 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 374 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 375 // rbp location is known implicitly by the frame sender code, needs no oopmap 376 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 385 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 386 if (UseAPX) { 387 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 402 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 403 } 404 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 405 // on EVEX enabled targets, we get it included in the xsave area 406 off = xmm0H_off; 407 delta = xmm1H_off - off; 408 for (int n = 0; n < 16; n++) { 409 XMMRegister xmm_name = as_XMMRegister(n); 410 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 411 off += delta; 412 } 413 if (UseAVX > 2) { 414 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 415 off = zmm16H_off; 416 delta = zmm17H_off - off; 417 for (int n = 16; n < num_xmm_regs; n++) { 418 XMMRegister zmm_name = as_XMMRegister(n); 419 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 420 off += delta; 421 } 422 } 423 } 424 425 return map; 
426 } 427 428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 429 int num_xmm_regs = XMMRegister::available_xmm_registers(); 430 if (frame::arg_reg_save_area_bytes != 0) { 431 // Pop arg register save area 432 __ addptr(rsp, frame::arg_reg_save_area_bytes); 433 } 434 435 #if COMPILER2_OR_JVMCI 436 if (restore_wide_vectors) { 437 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 438 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 439 } 440 #else 441 assert(!restore_wide_vectors, "vectors are generated only by C2"); 442 #endif 443 444 __ vzeroupper(); 445 446 // On EVEX enabled targets everything is handled in pop fpu state 447 if (restore_wide_vectors) { 448 // Restore upper half of YMM registers (0..15) 449 int base_addr = XSAVE_AREA_YMM_BEGIN; 450 for (int n = 0; n < 16; n++) { 451 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 452 } 453 if (VM_Version::supports_evex()) { 454 // Restore upper half of ZMM registers (0..15) 455 base_addr = XSAVE_AREA_ZMM_BEGIN; 456 for (int n = 0; n < 16; n++) { 457 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 458 } 459 // Restore full ZMM registers(16..num_xmm_regs) 460 base_addr = XSAVE_AREA_UPPERBANK; 461 int vector_len = Assembler::AVX_512bit; 462 int off = 0; 463 for (int n = 16; n < num_xmm_regs; n++) { 464 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 465 } 466 #if COMPILER2_OR_JVMCI 467 base_addr = XSAVE_AREA_OPMASK_BEGIN; 468 off = 0; 469 for (int n = 0; n < KRegister::number_of_registers; n++) { 470 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 471 } 472 #endif 473 } 474 } else { 475 if (VM_Version::supports_evex()) { 476 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 477 int base_addr = XSAVE_AREA_UPPERBANK; 478 int off = 0; 479 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 480 for (int n = 16; n < num_xmm_regs; n++) { 481 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 482 } 483 #if COMPILER2_OR_JVMCI 484 base_addr = XSAVE_AREA_OPMASK_BEGIN; 485 off = 0; 486 for (int n = 0; n < KRegister::number_of_registers; n++) { 487 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 488 } 489 #endif 490 } 491 } 492 493 #if COMPILER2_OR_JVMCI 494 if (UseAPX) { 495 int base_addr = XSAVE_AREA_EGPRS; 496 int off = 0; 497 for (int n = 16; n < Register::number_of_registers; n++) { 498 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 499 } 500 } 501 #endif 502 503 // Recover CPU state 504 __ pop_FPU_state(); 505 __ restore_legacy_gprs(); 506 __ addq(rsp, 8); 507 __ popf(); 508 // Get the rbp described implicitly by the calling convention (no oopMap) 509 __ pop(rbp); 510 } 511 512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 513 514 // Just restore result register. Only used by deoptimization. By 515 // now any callee save register that needs to be restored to a c2 516 // caller of the deoptee has been extracted into the vframeArray 517 // and will be stuffed into the c2i adapter we create for later 518 // restoration so only result registers need to be restored here. 
  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16 byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
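  // As an illustration of the convention implemented below: for a signature
  // (int, long, Object, double, float) with registers to spare, the int goes
  // in j_rarg0, the long in j_rarg1 (its trailing T_VOID half gets set_bad()),
  // the Object in j_rarg2, the double in j_farg0 and the float in j_farg1.
  // Once the j_rarg*/j_farg* registers are exhausted, arguments fall back to
  // stack slots; stk_args is aligned to 2 first, so every stack argument
  // starts on an 8-byte boundary.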
563 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 564 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 565 }; 566 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 567 j_farg0, j_farg1, j_farg2, j_farg3, 568 j_farg4, j_farg5, j_farg6, j_farg7 569 }; 570 571 572 uint int_args = 0; 573 uint fp_args = 0; 574 uint stk_args = 0; 575 576 for (int i = 0; i < total_args_passed; i++) { 577 switch (sig_bt[i]) { 578 case T_BOOLEAN: 579 case T_CHAR: 580 case T_BYTE: 581 case T_SHORT: 582 case T_INT: 583 if (int_args < Argument::n_int_register_parameters_j) { 584 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 585 } else { 586 stk_args = align_up(stk_args, 2); 587 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 588 stk_args += 1; 589 } 590 break; 591 case T_VOID: 592 // halves of T_LONG or T_DOUBLE 593 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 594 regs[i].set_bad(); 595 break; 596 case T_LONG: 597 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 598 // fall through 599 case T_OBJECT: 600 case T_ARRAY: 601 case T_ADDRESS: 602 if (int_args < Argument::n_int_register_parameters_j) { 603 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 604 } else { 605 stk_args = align_up(stk_args, 2); 606 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 607 stk_args += 2; 608 } 609 break; 610 case T_FLOAT: 611 if (fp_args < Argument::n_float_register_parameters_j) { 612 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 613 } else { 614 stk_args = align_up(stk_args, 2); 615 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 616 stk_args += 1; 617 } 618 break; 619 case T_DOUBLE: 620 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 621 if (fp_args < Argument::n_float_register_parameters_j) { 622 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 623 } else { 624 stk_args = align_up(stk_args, 2); 625 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 626 stk_args += 2; 627 } 628 break; 629 default: 630 ShouldNotReachHere(); 631 break; 632 } 633 } 634 635 return stk_args; 636 } 637 638 // Same as java_calling_convention() but for multiple return 639 // values. There's no way to store them on the stack so if we don't 640 // have enough registers, multiple values can't be returned. 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1; 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j; 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt, 644 VMRegPair *regs, 645 int total_args_passed) { 646 // Create the mapping between argument positions and 647 // registers. 
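  // Note that the integer "return registers" below start with rax and then
  // reuse the j_rarg* registers in reverse order, while the float registers
  // match the argument convention. There is no stack form: if a signature
  // needs more registers than are available, this function returns -1.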
648 static const Register INT_ArgReg[java_return_convention_max_int] = { 649 rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0 650 }; 651 static const XMMRegister FP_ArgReg[java_return_convention_max_float] = { 652 j_farg0, j_farg1, j_farg2, j_farg3, 653 j_farg4, j_farg5, j_farg6, j_farg7 654 }; 655 656 657 uint int_args = 0; 658 uint fp_args = 0; 659 660 for (int i = 0; i < total_args_passed; i++) { 661 switch (sig_bt[i]) { 662 case T_BOOLEAN: 663 case T_CHAR: 664 case T_BYTE: 665 case T_SHORT: 666 case T_INT: 667 if (int_args < Argument::n_int_register_parameters_j+1) { 668 regs[i].set1(INT_ArgReg[int_args]->as_VMReg()); 669 int_args++; 670 } else { 671 return -1; 672 } 673 break; 674 case T_VOID: 675 // halves of T_LONG or T_DOUBLE 676 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 677 regs[i].set_bad(); 678 break; 679 case T_LONG: 680 assert(sig_bt[i + 1] == T_VOID, "expecting half"); 681 // fall through 682 case T_OBJECT: 683 case T_ARRAY: 684 case T_ADDRESS: 685 case T_METADATA: 686 if (int_args < Argument::n_int_register_parameters_j+1) { 687 regs[i].set2(INT_ArgReg[int_args]->as_VMReg()); 688 int_args++; 689 } else { 690 return -1; 691 } 692 break; 693 case T_FLOAT: 694 if (fp_args < Argument::n_float_register_parameters_j) { 695 regs[i].set1(FP_ArgReg[fp_args]->as_VMReg()); 696 fp_args++; 697 } else { 698 return -1; 699 } 700 break; 701 case T_DOUBLE: 702 assert(sig_bt[i + 1] == T_VOID, "expecting half"); 703 if (fp_args < Argument::n_float_register_parameters_j) { 704 regs[i].set2(FP_ArgReg[fp_args]->as_VMReg()); 705 fp_args++; 706 } else { 707 return -1; 708 } 709 break; 710 default: 711 ShouldNotReachHere(); 712 break; 713 } 714 } 715 716 return int_args + fp_args; 717 } 718 719 // Patch the callers callsite with entry to compiled code if it exists. 720 static void patch_callers_callsite(MacroAssembler *masm) { 721 Label L; 722 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 723 __ jcc(Assembler::equal, L); 724 725 // Save the current stack pointer 726 __ mov(r13, rsp); 727 // Schedule the branch target address early. 728 // Call into the VM to patch the caller, then jump to compiled callee 729 // rax isn't live so capture return address while we easily can 730 __ movptr(rax, Address(rsp, 0)); 731 732 // align stack so push_CPU_state doesn't fault 733 __ andptr(rsp, -(StackAlignmentInBytes)); 734 __ push_CPU_state(); 735 __ vzeroupper(); 736 // VM needs caller's callsite 737 // VM needs target method 738 // This needs to be a long call since we will relocate this adapter to 739 // the codeBuffer and it may not reach 740 741 // Allocate argument register save area 742 if (frame::arg_reg_save_area_bytes != 0) { 743 __ subptr(rsp, frame::arg_reg_save_area_bytes); 744 } 745 __ mov(c_rarg0, rbx); 746 __ mov(c_rarg1, rax); 747 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 748 749 // De-allocate argument register save area 750 if (frame::arg_reg_save_area_bytes != 0) { 751 __ addptr(rsp, frame::arg_reg_save_area_bytes); 752 } 753 754 __ vzeroupper(); 755 __ pop_CPU_state(); 756 // restore sp 757 __ mov(rsp, r13); 758 __ bind(L); 759 } 760 761 // For each inline type argument, sig includes the list of fields of 762 // the inline type. This utility function computes the number of 763 // arguments for the call if inline types are passed by reference (the 764 // calling convention the interpreter expects). 
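// For example, the flattened sequence described below,
//   T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID T_VOID T_VOID,
// contributes several compiled-code arguments but counts as a single
// interpreter argument, since the interpreter receives the inline type as
// one buffered reference.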
static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
  int total_args_passed = 0;
  if (InlineTypePassFieldsAsArgs) {
    for (int i = 0; i < sig_extended->length(); i++) {
      BasicType bt = sig_extended->at(i)._bt;
      if (bt == T_METADATA) {
        // In sig_extended, an inline type argument starts with:
        // T_METADATA, followed by the types of the fields of the
        // inline type and T_VOID to mark the end of the value
        // type. Inline types are flattened so, for instance, in the
        // case of an inline type with an int field and an inline type
        // field that itself has 2 fields, an int and a long:
        // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
        // slot for the T_LONG) T_VOID (inner inline type) T_VOID
        // (outer inline type)
        total_args_passed++;
        int vt = 1;
        do {
          i++;
          BasicType bt = sig_extended->at(i)._bt;
          BasicType prev_bt = sig_extended->at(i-1)._bt;
          if (bt == T_METADATA) {
            vt++;
          } else if (bt == T_VOID &&
                     prev_bt != T_LONG &&
                     prev_bt != T_DOUBLE) {
            vt--;
          }
        } while (vt != 0);
      } else {
        total_args_passed++;
      }
    }
  } else {
    total_args_passed = sig_extended->length();
  }
  return total_args_passed;
}


static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
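  // Concretely, with the layout above the T_LONG at st_off 32 is actually
  // written at next_off 24 (the T_VOID slot); in debug builds the unused
  // slot at 32 is then overwritten with a recognizable junk pattern by the
  // caller, gen_c2i_adapter.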
830 831 bool wide = (size_in_bytes == wordSize); 832 VMReg r_1 = reg_pair.first(); 833 VMReg r_2 = reg_pair.second(); 834 assert(r_2->is_valid() == wide, "invalid size"); 835 if (!r_1->is_valid()) { 836 assert(!r_2->is_valid(), "must be invalid"); 837 return; 838 } 839 840 if (!r_1->is_XMMRegister()) { 841 Register val = rax; 842 if (r_1->is_stack()) { 843 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 844 __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false); 845 } else { 846 val = r_1->as_Register(); 847 } 848 assert_different_registers(to.base(), val, rscratch1); 849 if (is_oop) { 850 __ push(r13); 851 __ push(rbx); 852 __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 853 __ pop(rbx); 854 __ pop(r13); 855 } else { 856 __ store_sized_value(to, val, size_in_bytes); 857 } 858 } else { 859 if (wide) { 860 __ movdbl(to, r_1->as_XMMRegister()); 861 } else { 862 __ movflt(to, r_1->as_XMMRegister()); 863 } 864 } 865 } 866 867 static void gen_c2i_adapter(MacroAssembler *masm, 868 const GrowableArray<SigEntry>* sig_extended, 869 const VMRegPair *regs, 870 bool requires_clinit_barrier, 871 address& c2i_no_clinit_check_entry, 872 Label& skip_fixup, 873 address start, 874 OopMapSet* oop_maps, 875 int& frame_complete, 876 int& frame_size_in_words, 877 bool alloc_inline_receiver) { 878 if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) { 879 Label L_skip_barrier; 880 Register method = rbx; 881 882 { // Bypass the barrier for non-static methods 883 Register flags = rscratch1; 884 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset())); 885 __ testl(flags, JVM_ACC_STATIC); 886 __ jcc(Assembler::zero, L_skip_barrier); // non-static 887 } 888 889 Register klass = rscratch1; 890 __ load_method_holder(klass, method); 891 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 892 893 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 894 895 __ bind(L_skip_barrier); 896 c2i_no_clinit_check_entry = __ pc(); 897 } 898 899 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 900 bs->c2i_entry_barrier(masm); 901 902 // Before we get into the guts of the C2I adapter, see if we should be here 903 // at all. We've come from compiled code and are attempting to jump to the 904 // interpreter, which means the caller made a static call to get here 905 // (vcalls always get a compiled target if there is one). Check for a 906 // compiled target. If there is one, we need to patch the caller's call. 907 patch_callers_callsite(masm); 908 909 __ bind(skip_fixup); 910 911 if (InlineTypePassFieldsAsArgs) { 912 // Is there an inline type argument? 913 bool has_inline_argument = false; 914 for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) { 915 has_inline_argument = (sig_extended->at(i)._bt == T_METADATA); 916 } 917 if (has_inline_argument) { 918 // There is at least an inline type argument: we're coming from 919 // compiled code so we have no buffers to back the inline types. 920 // Allocate the buffers here with a runtime call. 
921 OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false); 922 923 frame_complete = __ offset(); 924 925 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 926 927 __ mov(c_rarg0, r15_thread); 928 __ mov(c_rarg1, rbx); 929 __ mov64(c_rarg2, (int64_t)alloc_inline_receiver); 930 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types))); 931 932 oop_maps->add_gc_map((int)(__ pc() - start), map); 933 __ reset_last_Java_frame(false); 934 935 RegisterSaver::restore_live_registers(masm); 936 937 Label no_exception; 938 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 939 __ jcc(Assembler::equal, no_exception); 940 941 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD); 942 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 943 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 944 945 __ bind(no_exception); 946 947 // We get an array of objects from the runtime call 948 __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr() 949 __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live? 950 } 951 } 952 953 // Since all args are passed on the stack, total_args_passed * 954 // Interpreter::stackElementSize is the space we need. 955 int total_args_passed = compute_total_args_passed_int(sig_extended); 956 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 957 958 int extraspace = (total_args_passed * Interpreter::stackElementSize); 959 960 // stack is aligned, keep it that way 961 // This is not currently needed or enforced by the interpreter, but 962 // we might as well conform to the ABI. 963 extraspace = align_up(extraspace, 2*wordSize); 964 965 // set senderSP value 966 __ lea(r13, Address(rsp, wordSize)); 967 968 #ifdef ASSERT 969 __ check_stack_alignment(r13, "sender stack not aligned"); 970 #endif 971 if (extraspace > 0) { 972 // Pop the return address 973 __ pop(rax); 974 975 __ subptr(rsp, extraspace); 976 977 // Push the return address 978 __ push(rax); 979 980 // Account for the return address location since we store it first rather 981 // than hold it in a register across all the shuffling 982 extraspace += wordSize; 983 } 984 985 #ifdef ASSERT 986 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 987 #endif 988 989 // Now write the args into the outgoing interpreter space 990 991 // next_arg_comp is the next argument from the compiler point of 992 // view (inline type fields are passed in registers/on the stack). In 993 // sig_extended, an inline type argument starts with: T_METADATA, 994 // followed by the types of the fields of the inline type and T_VOID 995 // to mark the end of the inline type. ignored counts the number of 996 // T_METADATA/T_VOID. next_vt_arg is the next inline type argument: 997 // used to get the buffer for that argument from the pool of buffers 998 // we allocated above and want to pass to the 999 // interpreter. next_arg_int is the next argument from the 1000 // interpreter point of view (inline types are passed by reference). 
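  // For example, with sig_extended = { T_INT, T_METADATA, T_INT, T_VOID }
  // (an int followed by an inline type with a single int field):
  //  - next_arg_comp 0 (T_INT) is copied straight to an interpreter slot,
  //  - next_arg_comp 1 (T_METADATA) bumps ignored to 1 and loads the next
  //    pre-allocated buffer into r14,
  //  - next_arg_comp 2 (T_INT) is stored into that buffer using
  //    regs[2 - ignored] = regs[1],
  //  - next_arg_comp 3 (T_VOID) closes the inline type, and the buffer
  //    reference itself is written to the interpreter slot.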
1001 for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0; 1002 next_arg_comp < sig_extended->length(); next_arg_comp++) { 1003 assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments"); 1004 assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?"); 1005 BasicType bt = sig_extended->at(next_arg_comp)._bt; 1006 int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize; 1007 if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) { 1008 int next_off = st_off - Interpreter::stackElementSize; 1009 const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off; 1010 const VMRegPair reg_pair = regs[next_arg_comp-ignored]; 1011 size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4; 1012 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL, 1013 size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false); 1014 next_arg_int++; 1015 #ifdef ASSERT 1016 if (bt == T_LONG || bt == T_DOUBLE) { 1017 // Overwrite the unused slot with known junk 1018 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 1019 __ movptr(Address(rsp, st_off), rax); 1020 } 1021 #endif /* ASSERT */ 1022 } else { 1023 ignored++; 1024 // get the buffer from the just allocated pool of buffers 1025 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT); 1026 __ load_heap_oop(r14, Address(rscratch2, index)); 1027 next_vt_arg++; next_arg_int++; 1028 int vt = 1; 1029 // write fields we get from compiled code in registers/stack 1030 // slots to the buffer: we know we are done with that inline type 1031 // argument when we hit the T_VOID that acts as an end of inline 1032 // type delimiter for this inline type. Inline types are flattened 1033 // so we might encounter embedded inline types. Each entry in 1034 // sig_extended contains a field offset in the buffer. 1035 Label L_null; 1036 do { 1037 next_arg_comp++; 1038 BasicType bt = sig_extended->at(next_arg_comp)._bt; 1039 BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt; 1040 if (bt == T_METADATA) { 1041 vt++; 1042 ignored++; 1043 } else if (bt == T_VOID && 1044 prev_bt != T_LONG && 1045 prev_bt != T_DOUBLE) { 1046 vt--; 1047 ignored++; 1048 } else { 1049 int off = sig_extended->at(next_arg_comp)._offset; 1050 if (off == -1) { 1051 // Nullable inline type argument, emit null check 1052 VMReg reg = regs[next_arg_comp-ignored].first(); 1053 Label L_notNull; 1054 if (reg->is_stack()) { 1055 int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 1056 __ testb(Address(rsp, ld_off), 1); 1057 } else { 1058 __ testb(reg->as_Register(), 1); 1059 } 1060 __ jcc(Assembler::notZero, L_notNull); 1061 __ movptr(Address(rsp, st_off), 0); 1062 __ jmp(L_null); 1063 __ bind(L_notNull); 1064 continue; 1065 } 1066 assert(off > 0, "offset in object should be positive"); 1067 size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize; 1068 bool is_oop = is_reference_type(bt); 1069 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL, 1070 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop); 1071 } 1072 } while (vt != 0); 1073 // pass the buffer to the interpreter 1074 __ movptr(Address(rsp, st_off), r14); 1075 __ bind(L_null); 1076 } 1077 } 1078 1079 // Schedule the branch target address early. 
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get the args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, or else we lose
  // the alignment all compiled code expects and the register save code
  // can segv when fxsave instructions find an improperly aligned stack
  // pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address, misaligning the stack so that the youngest frame
  // sees the same layout a call instruction would have produced.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
1142 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset()))); 1143 1144 #if INCLUDE_JVMCI 1145 if (EnableJVMCI) { 1146 // check if this call should be routed towards a specific entry point 1147 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 1148 Label no_alternative_target; 1149 __ jcc(Assembler::equal, no_alternative_target); 1150 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 1151 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 1152 __ bind(no_alternative_target); 1153 } 1154 #endif // INCLUDE_JVMCI 1155 1156 int total_args_passed = sig->length(); 1157 1158 // Now generate the shuffle code. Pick up all register args and move the 1159 // rest through the floating point stack top. 1160 for (int i = 0; i < total_args_passed; i++) { 1161 BasicType bt = sig->at(i)._bt; 1162 if (bt == T_VOID) { 1163 // Longs and doubles are passed in native word order, but misaligned 1164 // in the 32-bit build. 1165 BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL; 1166 assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half"); 1167 continue; 1168 } 1169 1170 // Pick up 0, 1 or 2 words from SP+offset. 1171 1172 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 1173 "scrambled load targets?"); 1174 // Load in argument order going down. 1175 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 1176 // Point to interpreter value (vs. tag) 1177 int next_off = ld_off - Interpreter::stackElementSize; 1178 // 1179 // 1180 // 1181 VMReg r_1 = regs[i].first(); 1182 VMReg r_2 = regs[i].second(); 1183 if (!r_1->is_valid()) { 1184 assert(!r_2->is_valid(), ""); 1185 continue; 1186 } 1187 if (r_1->is_stack()) { 1188 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 1189 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 1190 1191 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 1192 // and if we end up going thru a c2i because of a miss a reasonable value of r13 1193 // will be generated. 1194 if (!r_2->is_valid()) { 1195 // sign extend??? 1196 __ movl(r13, Address(saved_sp, ld_off)); 1197 __ movptr(Address(rsp, st_off), r13); 1198 } else { 1199 // 1200 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1201 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1202 // So we must adjust where to pick up the data to match the interpreter. 1203 // 1204 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 1205 // are accessed as negative so LSW is at LOW address 1206 1207 // ld_off is MSW so get LSW 1208 const int offset = (bt==T_LONG||bt==T_DOUBLE)? 1209 next_off : ld_off; 1210 __ movq(r13, Address(saved_sp, offset)); 1211 // st_off is LSW (i.e. reg.first()) 1212 __ movq(Address(rsp, st_off), r13); 1213 } 1214 } else if (r_1->is_Register()) { // Register argument 1215 Register r = r_1->as_Register(); 1216 assert(r != rax, "must be different"); 1217 if (r_2->is_valid()) { 1218 // 1219 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1220 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1221 // So we must adjust where to pick up the data to match the interpreter. 1222 1223 const int offset = (bt==T_LONG||bt==T_DOUBLE)? 
1224 next_off : ld_off; 1225 1226 // this can be a misaligned move 1227 __ movq(r, Address(saved_sp, offset)); 1228 } else { 1229 // sign extend and use a full word? 1230 __ movl(r, Address(saved_sp, ld_off)); 1231 } 1232 } else { 1233 if (!r_2->is_valid()) { 1234 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 1235 } else { 1236 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 1237 } 1238 } 1239 } 1240 1241 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 1242 1243 // 6243940 We might end up in handle_wrong_method if 1244 // the callee is deoptimized as we race thru here. If that 1245 // happens we don't want to take a safepoint because the 1246 // caller frame will look interpreted and arguments are now 1247 // "compiled" so it is much better to make this transition 1248 // invisible to the stack walking code. Unfortunately if 1249 // we try and find the callee by normal means a safepoint 1250 // is possible. So we stash the desired callee in the thread 1251 // and the vm will find there should this case occur. 1252 1253 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx); 1254 1255 // put Method* where a c2i would expect should we end up there 1256 // only needed because of c2 resolve stubs return Method* as a result in 1257 // rax 1258 __ mov(rax, rbx); 1259 __ jmp(r11); 1260 } 1261 1262 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) { 1263 Register data = rax; 1264 __ ic_check(1 /* end_alignment */); 1265 __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset())); 1266 1267 // Method might have been compiled since the call site was patched to 1268 // interpreted if that is the case treat it as a miss so we can get 1269 // the call site corrected. 1270 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 1271 __ jcc(Assembler::equal, skip_fixup); 1272 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1273 } 1274 1275 // --------------------------------------------------------------- 1276 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm, 1277 int comp_args_on_stack, 1278 const GrowableArray<SigEntry>* sig, 1279 const VMRegPair* regs, 1280 const GrowableArray<SigEntry>* sig_cc, 1281 const VMRegPair* regs_cc, 1282 const GrowableArray<SigEntry>* sig_cc_ro, 1283 const VMRegPair* regs_cc_ro, 1284 AdapterHandlerEntry* handler, 1285 AdapterBlob*& new_adapter, 1286 bool allocate_code_blob) { 1287 address i2c_entry = __ pc(); 1288 gen_i2c_adapter(masm, comp_args_on_stack, sig, regs); 1289 1290 // ------------------------------------------------------------------------- 1291 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls 1292 // to the interpreter. The args start out packed in the compiled layout. They 1293 // need to be unpacked into the interpreter layout. This will almost always 1294 // require some stack space. We grow the current (compiled) stack, then repack 1295 // the args. We finally end in a jump to the generic interpreter entry point. 1296 // On exit from the interpreter, the interpreter will restore our SP (lest the 1297 // compiled code, which relies solely on SP and not RBP, get sick). 
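  // Several c2i entry points are generated below and handed to
  // handler->set_entry_points() at the end:
  //  - c2i_unverified_entry / c2i_unverified_inline_entry run the inline
  //    cache check first,
  //  - c2i_inline_ro_entry takes scalarized arguments but a non-scalarized
  //    (not packed) receiver,
  //  - c2i_entry / c2i_inline_entry take the scalarized calling convention,
  //  - and, when regs != regs_cc, a separate non-scalarized variant is
  //    generated as well.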
1298 1299 address c2i_unverified_entry = __ pc(); 1300 address c2i_unverified_inline_entry = __ pc(); 1301 Label skip_fixup; 1302 1303 gen_inline_cache_check(masm, skip_fixup); 1304 1305 OopMapSet* oop_maps = new OopMapSet(); 1306 int frame_complete = CodeOffsets::frame_never_safe; 1307 int frame_size_in_words = 0; 1308 1309 // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver) 1310 address c2i_no_clinit_check_entry = nullptr; 1311 address c2i_inline_ro_entry = __ pc(); 1312 if (regs_cc != regs_cc_ro) { 1313 // No class init barrier needed because method is guaranteed to be non-static 1314 gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry, 1315 skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false); 1316 skip_fixup.reset(); 1317 } 1318 1319 // Scalarized c2i adapter 1320 address c2i_entry = __ pc(); 1321 address c2i_inline_entry = __ pc(); 1322 gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry, 1323 skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true); 1324 1325 // Non-scalarized c2i adapter 1326 if (regs != regs_cc) { 1327 c2i_unverified_inline_entry = __ pc(); 1328 Label inline_entry_skip_fixup; 1329 gen_inline_cache_check(masm, inline_entry_skip_fixup); 1330 1331 c2i_inline_entry = __ pc(); 1332 gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry, 1333 inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false); 1334 } 1335 1336 // The c2i adapters might safepoint and trigger a GC. The caller must make sure that 1337 // the GC knows about the location of oop argument locations passed to the c2i adapter. 1338 if (allocate_code_blob) { 1339 bool caller_must_gc_arguments = (regs != regs_cc); 1340 new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments); 1341 } 1342 1343 handler->set_entry_points(i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, 1344 c2i_unverified_inline_entry, c2i_no_clinit_check_entry); 1345 } 1346 1347 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1348 VMRegPair *regs, 1349 int total_args_passed) { 1350 1351 // We return the amount of VMRegImpl stack slots we need to reserve for all 1352 // the arguments NOT counting out_preserve_stack_slots. 
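  // Two native ABIs are handled below. On the System V (non-Windows) side
  // there are six integer argument registers and eight XMM argument
  // registers with independent counters. On Windows there are only four
  // parameter positions shared between integer and XMM registers, every
  // register argument also reserves a stack home slot, and at least 32
  // bytes of such home space are always allocated.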
1353 1354 // NOTE: These arrays will have to change when c1 is ported 1355 #ifdef _WIN64 1356 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1357 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1358 }; 1359 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1360 c_farg0, c_farg1, c_farg2, c_farg3 1361 }; 1362 #else 1363 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1364 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1365 }; 1366 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1367 c_farg0, c_farg1, c_farg2, c_farg3, 1368 c_farg4, c_farg5, c_farg6, c_farg7 1369 }; 1370 #endif // _WIN64 1371 1372 1373 uint int_args = 0; 1374 uint fp_args = 0; 1375 uint stk_args = 0; // inc by 2 each time 1376 1377 for (int i = 0; i < total_args_passed; i++) { 1378 switch (sig_bt[i]) { 1379 case T_BOOLEAN: 1380 case T_CHAR: 1381 case T_BYTE: 1382 case T_SHORT: 1383 case T_INT: 1384 if (int_args < Argument::n_int_register_parameters_c) { 1385 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1386 #ifdef _WIN64 1387 fp_args++; 1388 // Allocate slots for callee to stuff register args the stack. 1389 stk_args += 2; 1390 #endif 1391 } else { 1392 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1393 stk_args += 2; 1394 } 1395 break; 1396 case T_LONG: 1397 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1398 // fall through 1399 case T_OBJECT: 1400 case T_ARRAY: 1401 case T_ADDRESS: 1402 case T_METADATA: 1403 if (int_args < Argument::n_int_register_parameters_c) { 1404 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1405 #ifdef _WIN64 1406 fp_args++; 1407 stk_args += 2; 1408 #endif 1409 } else { 1410 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1411 stk_args += 2; 1412 } 1413 break; 1414 case T_FLOAT: 1415 if (fp_args < Argument::n_float_register_parameters_c) { 1416 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1417 #ifdef _WIN64 1418 int_args++; 1419 // Allocate slots for callee to stuff register args the stack. 1420 stk_args += 2; 1421 #endif 1422 } else { 1423 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1424 stk_args += 2; 1425 } 1426 break; 1427 case T_DOUBLE: 1428 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1429 if (fp_args < Argument::n_float_register_parameters_c) { 1430 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1431 #ifdef _WIN64 1432 int_args++; 1433 // Allocate slots for callee to stuff register args the stack. 1434 stk_args += 2; 1435 #endif 1436 } else { 1437 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1438 stk_args += 2; 1439 } 1440 break; 1441 case T_VOID: // Halves of longs and doubles 1442 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1443 regs[i].set_bad(); 1444 break; 1445 default: 1446 ShouldNotReachHere(); 1447 break; 1448 } 1449 } 1450 #ifdef _WIN64 1451 // windows abi requires that we always allocate enough stack space 1452 // for 4 64bit registers to be stored down. 
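  // That is at least 8 VMRegImpl slots of 4 bytes each, i.e. the 32-byte
  // register parameter home area the Windows ABI mandates.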
1453 if (stk_args < 8) { 1454 stk_args = 8; 1455 } 1456 #endif // _WIN64 1457 1458 return stk_args; 1459 } 1460 1461 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1462 uint num_bits, 1463 uint total_args_passed) { 1464 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1465 "only certain vector sizes are supported for now"); 1466 1467 static const XMMRegister VEC_ArgReg[32] = { 1468 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1469 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1470 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1471 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1472 }; 1473 1474 uint stk_args = 0; 1475 uint fp_args = 0; 1476 1477 for (uint i = 0; i < total_args_passed; i++) { 1478 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1479 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1480 regs[i].set_pair(vmreg->next(next_val), vmreg); 1481 } 1482 1483 return stk_args; 1484 } 1485 1486 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1487 // We always ignore the frame_slots arg and just use the space just below frame pointer 1488 // which by this time is free to use 1489 switch (ret_type) { 1490 case T_FLOAT: 1491 __ movflt(Address(rbp, -wordSize), xmm0); 1492 break; 1493 case T_DOUBLE: 1494 __ movdbl(Address(rbp, -wordSize), xmm0); 1495 break; 1496 case T_VOID: break; 1497 default: { 1498 __ movptr(Address(rbp, -wordSize), rax); 1499 } 1500 } 1501 } 1502 1503 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1504 // We always ignore the frame_slots arg and just use the space just below frame pointer 1505 // which by this time is free to use 1506 switch (ret_type) { 1507 case T_FLOAT: 1508 __ movflt(xmm0, Address(rbp, -wordSize)); 1509 break; 1510 case T_DOUBLE: 1511 __ movdbl(xmm0, Address(rbp, -wordSize)); 1512 break; 1513 case T_VOID: break; 1514 default: { 1515 __ movptr(rax, Address(rbp, -wordSize)); 1516 } 1517 } 1518 } 1519 1520 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1521 for ( int i = first_arg ; i < arg_count ; i++ ) { 1522 if (args[i].first()->is_Register()) { 1523 __ push(args[i].first()->as_Register()); 1524 } else if (args[i].first()->is_XMMRegister()) { 1525 __ subptr(rsp, 2*wordSize); 1526 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1527 } 1528 } 1529 } 1530 1531 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1532 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1533 if (args[i].first()->is_Register()) { 1534 __ pop(args[i].first()->as_Register()); 1535 } else if (args[i].first()->is_XMMRegister()) { 1536 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1537 __ addptr(rsp, 2*wordSize); 1538 } 1539 } 1540 } 1541 1542 static void verify_oop_args(MacroAssembler* masm, 1543 const methodHandle& method, 1544 const BasicType* sig_bt, 1545 const VMRegPair* regs) { 1546 Register temp_reg = rbx; // not part of any compiled calling seq 1547 if (VerifyOops) { 1548 for (int i = 0; i < method->size_of_parameters(); i++) { 1549 if (is_reference_type(sig_bt[i])) { 1550 VMReg r = regs[i].first(); 1551 assert(r->is_valid(), "bad oop arg"); 1552 if (r->is_stack()) { 1553 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1554 __ verify_oop(temp_reg); 1555 } else { 1556 __ 
verify_oop(r->as_Register()); 1557 } 1558 } 1559 } 1560 } 1561 } 1562 1563 static void check_continuation_enter_argument(VMReg actual_vmreg, 1564 Register expected_reg, 1565 const char* name) { 1566 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1567 assert(actual_vmreg->as_Register() == expected_reg, 1568 "%s is in unexpected register: %s instead of %s", 1569 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1570 } 1571 1572 1573 //---------------------------- continuation_enter_setup --------------------------- 1574 // 1575 // Arguments: 1576 // None. 1577 // 1578 // Results: 1579 // rsp: pointer to blank ContinuationEntry 1580 // 1581 // Kills: 1582 // rax 1583 // 1584 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1585 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1586 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1587 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1588 1589 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1590 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1591 1592 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1593 OopMap* map = new OopMap(frame_size, 0); 1594 1595 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1596 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1597 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1598 1599 return map; 1600 } 1601 1602 //---------------------------- fill_continuation_entry --------------------------- 1603 // 1604 // Arguments: 1605 // rsp: pointer to blank Continuation entry 1606 // reg_cont_obj: pointer to the continuation 1607 // reg_flags: flags 1608 // 1609 // Results: 1610 // rsp: pointer to filled out ContinuationEntry 1611 // 1612 // Kills: 1613 // rax 1614 // 1615 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1616 assert_different_registers(rax, reg_cont_obj, reg_flags); 1617 #ifdef ASSERT 1618 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1619 #endif 1620 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1621 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1622 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1623 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1624 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1625 1626 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1627 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1628 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1629 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1630 1631 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1632 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1633 } 1634 1635 //---------------------------- continuation_enter_cleanup --------------------------- 1636 // 1637 // Arguments: 1638 // rsp: pointer to the ContinuationEntry 1639 // 1640 // Results: 1641 // rsp: pointer to the spilled rbp in the entry frame 1642 // 1643 // Kills: 1644 // rbx 1645 // 1646 static void continuation_enter_cleanup(MacroAssembler* masm) { 1647 #ifdef ASSERT 1648 
Label L_good_sp; 1649 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1650 __ jcc(Assembler::equal, L_good_sp); 1651 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1652 __ bind(L_good_sp); 1653 #endif 1654 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1655 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1656 1657 if (CheckJNICalls) { 1658 // Check if this is a virtual thread continuation 1659 Label L_skip_vthread_code; 1660 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1661 __ jcc(Assembler::equal, L_skip_vthread_code); 1662 1663 // If the held monitor count is > 0 and this vthread is terminating then 1664 // it failed to release a JNI monitor. So we issue the same log message 1665 // that JavaThread::exit does. 1666 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1667 __ jcc(Assembler::equal, L_skip_vthread_code); 1668 1669 // rax may hold an exception oop, save it before the call 1670 __ push(rax); 1671 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1672 __ pop(rax); 1673 1674 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1675 // on termination. The held count is implicitly zeroed below when we restore from 1676 // the parent held count (which has to be zero). 1677 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1678 1679 __ bind(L_skip_vthread_code); 1680 } 1681 #ifdef ASSERT 1682 else { 1683 // Check if this is a virtual thread continuation 1684 Label L_skip_vthread_code; 1685 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1686 __ jcc(Assembler::equal, L_skip_vthread_code); 1687 1688 // See comment just above. If not checking JNI calls the JNI count is only 1689 // needed for assertion checking. 1690 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1691 1692 __ bind(L_skip_vthread_code); 1693 } 1694 #endif 1695 1696 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1697 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1698 1699 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1700 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1701 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1702 } 1703 1704 static void gen_continuation_enter(MacroAssembler* masm, 1705 const VMRegPair* regs, 1706 int& exception_offset, 1707 OopMapSet* oop_maps, 1708 int& frame_complete, 1709 int& stack_slots, 1710 int& interpreted_entry_offset, 1711 int& compiled_entry_offset) { 1712 1713 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1714 int pos_cont_obj = 0; 1715 int pos_is_cont = 1; 1716 int pos_is_virtual = 2; 1717 1718 // The platform-specific calling convention may present the arguments in various registers. 1719 // To simplify the rest of the code, we expect the arguments to reside at these known 1720 // registers, and we additionally check the placement here in case calling convention ever 1721 // changes. 
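  // For reference (assuming the usual HotSpot x86-64 aliasing where
  // j_rarg0..j_rarg2 == c_rarg1..c_rarg3): enterSpecial is static, so its
  // three arguments arrive in j_rarg0..j_rarg2, which is why c_rarg1..c_rarg3
  // are the expected homes checked below.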
1722 Register reg_cont_obj = c_rarg1; 1723 Register reg_is_cont = c_rarg2; 1724 Register reg_is_virtual = c_rarg3; 1725 1726 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1727 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1728 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1729 1730 // Utility methods kill rax, make sure there are no collisions 1731 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1732 1733 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1734 relocInfo::static_call_type); 1735 1736 address start = __ pc(); 1737 1738 Label L_thaw, L_exit; 1739 1740 // i2i entry used at interp_only_mode only 1741 interpreted_entry_offset = __ pc() - start; 1742 { 1743 #ifdef ASSERT 1744 Label is_interp_only; 1745 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1746 __ jcc(Assembler::notEqual, is_interp_only); 1747 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1748 __ bind(is_interp_only); 1749 #endif 1750 1751 __ pop(rax); // return address 1752 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1753 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1754 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1755 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1756 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1757 __ push(rax); // return address 1758 __ push_cont_fastpath(); 1759 1760 __ enter(); 1761 1762 stack_slots = 2; // will be adjusted in setup 1763 OopMap* map = continuation_enter_setup(masm, stack_slots); 1764 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1765 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1766 1767 __ verify_oop(reg_cont_obj); 1768 1769 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1770 1771 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1772 __ testptr(reg_is_cont, reg_is_cont); 1773 __ jcc(Assembler::notZero, L_thaw); 1774 1775 // --- Resolve path 1776 1777 // Make sure the call is patchable 1778 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1779 // Emit stub for static call 1780 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1781 if (stub == nullptr) { 1782 fatal("CodeCache is full at gen_continuation_enter"); 1783 } 1784 __ call(resolve); 1785 oop_maps->add_gc_map(__ pc() - start, map); 1786 __ post_call_nop(); 1787 1788 __ jmp(L_exit); 1789 } 1790 1791 // compiled entry 1792 __ align(CodeEntryAlignment); 1793 compiled_entry_offset = __ pc() - start; 1794 __ enter(); 1795 1796 stack_slots = 2; // will be adjusted in setup 1797 OopMap* map = continuation_enter_setup(masm, stack_slots); 1798 1799 // Frame is now completed as far as size and linkage. 1800 frame_complete = __ pc() - start; 1801 1802 __ verify_oop(reg_cont_obj); 1803 1804 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1805 1806 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1807 __ testptr(reg_is_cont, reg_is_cont); 1808 __ jccb(Assembler::notZero, L_thaw); 1809 1810 // --- call Continuation.enter(Continuation c, boolean isContinue) 1811 1812 // Make sure the call is patchable 1813 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1814 1815 // Emit stub for static call 1816 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1817 if (stub == nullptr) { 1818 fatal("CodeCache is full at gen_continuation_enter"); 1819 } 1820 1821 // The call needs to be resolved. There's a special case for this in 1822 // SharedRuntime::find_callee_info_helper() which calls 1823 // LinkResolver::resolve_continuation_enter() which resolves the call to 1824 // Continuation.enter(Continuation c, boolean isContinue). 1825 __ call(resolve); 1826 1827 oop_maps->add_gc_map(__ pc() - start, map); 1828 __ post_call_nop(); 1829 1830 __ jmpb(L_exit); 1831 1832 // --- Thawing path 1833 1834 __ bind(L_thaw); 1835 1836 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1837 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1838 1839 ContinuationEntry::_return_pc_offset = __ pc() - start; 1840 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1841 __ post_call_nop(); 1842 1843 // --- Normal exit (resolve/thawing) 1844 1845 __ bind(L_exit); 1846 ContinuationEntry::_cleanup_offset = __ pc() - start; 1847 continuation_enter_cleanup(masm); 1848 __ pop(rbp); 1849 __ ret(0); 1850 1851 // --- Exception handling path 1852 1853 exception_offset = __ pc() - start; 1854 1855 continuation_enter_cleanup(masm); 1856 __ pop(rbp); 1857 1858 __ movptr(c_rarg0, r15_thread); 1859 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1860 1861 // rax still holds the original exception oop, save it before the call 1862 __ push(rax); 1863 1864 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1865 __ movptr(rbx, rax); 1866 1867 // Continue at exception handler: 1868 // rax: exception oop 1869 // rbx: exception handler 1870 // rdx: exception pc 1871 __ pop(rax); 1872 __ verify_oop(rax); 1873 __ pop(rdx); 1874 __ jmp(rbx); 1875 } 1876 1877 static void gen_continuation_yield(MacroAssembler* masm, 1878 const VMRegPair* regs, 1879 OopMapSet* oop_maps, 1880 int& frame_complete, 1881 int& stack_slots, 1882 int& compiled_entry_offset) { 1883 enum layout { 1884 rbp_off, 1885 rbpH_off, 1886 return_off, 1887 return_off2, 1888 framesize // inclusive of return address 1889 }; 1890 stack_slots = framesize / VMRegImpl::slots_per_word; 1891 assert(stack_slots == 2, "recheck layout"); 1892 1893 address start = __ pc(); 1894 compiled_entry_offset = __ pc() - start; 1895 __ enter(); 1896 address the_pc = __ pc(); 1897 1898 frame_complete = the_pc - start; 1899 1900 // This nop must be exactly at the PC we push into the frame info. 1901 // We use this nop for fast CodeBlob lookup, associate the OopMap 1902 // with it right away. 
1903 __ post_call_nop(); 1904 OopMap* map = new OopMap(framesize, 1); 1905 oop_maps->add_gc_map(frame_complete, map); 1906 1907 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1908 __ movptr(c_rarg0, r15_thread); 1909 __ movptr(c_rarg1, rsp); 1910 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1911 __ reset_last_Java_frame(true); 1912 1913 Label L_pinned; 1914 1915 __ testptr(rax, rax); 1916 __ jcc(Assembler::notZero, L_pinned); 1917 1918 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1919 continuation_enter_cleanup(masm); 1920 __ pop(rbp); 1921 __ ret(0); 1922 1923 __ bind(L_pinned); 1924 1925 // Pinned, return to caller 1926 1927 // handle pending exception thrown by freeze 1928 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1929 Label ok; 1930 __ jcc(Assembler::equal, ok); 1931 __ leave(); 1932 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1933 __ bind(ok); 1934 1935 __ leave(); 1936 __ ret(0); 1937 } 1938 1939 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1940 ::continuation_enter_cleanup(masm); 1941 } 1942 1943 static void gen_special_dispatch(MacroAssembler* masm, 1944 const methodHandle& method, 1945 const BasicType* sig_bt, 1946 const VMRegPair* regs) { 1947 verify_oop_args(masm, method, sig_bt, regs); 1948 vmIntrinsics::ID iid = method->intrinsic_id(); 1949 1950 // Now write the args into the outgoing interpreter space 1951 bool has_receiver = false; 1952 Register receiver_reg = noreg; 1953 int member_arg_pos = -1; 1954 Register member_reg = noreg; 1955 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 1956 if (ref_kind != 0) { 1957 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 1958 member_reg = rbx; // known to be free at this point 1959 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 1960 } else if (iid == vmIntrinsics::_invokeBasic) { 1961 has_receiver = true; 1962 } else if (iid == vmIntrinsics::_linkToNative) { 1963 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 1964 member_reg = rbx; // known to be free at this point 1965 } else { 1966 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 1967 } 1968 1969 if (member_reg != noreg) { 1970 // Load the member_arg into register, if necessary. 1971 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 1972 VMReg r = regs[member_arg_pos].first(); 1973 if (r->is_stack()) { 1974 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1975 } else { 1976 // no data motion is needed 1977 member_reg = r->as_Register(); 1978 } 1979 } 1980 1981 if (has_receiver) { 1982 // Make sure the receiver is loaded into a register. 1983 assert(method->size_of_parameters() > 0, "oob"); 1984 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 1985 VMReg r = regs[0].first(); 1986 assert(r->is_valid(), "bad receiver arg"); 1987 if (r->is_stack()) { 1988 // Porting note: This assumes that compiled calling conventions always 1989 // pass the receiver oop in a register. If this is not true on some 1990 // platform, pick a temp and load the receiver from stack. 
1991 fatal("receiver always in a register"); 1992 receiver_reg = j_rarg0; // known to be free at this point 1993 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1994 } else { 1995 // no data motion is needed 1996 receiver_reg = r->as_Register(); 1997 } 1998 } 1999 2000 // Figure out which address we are really jumping to: 2001 MethodHandles::generate_method_handle_dispatch(masm, iid, 2002 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 2003 } 2004 2005 // --------------------------------------------------------------------------- 2006 // Generate a native wrapper for a given method. The method takes arguments 2007 // in the Java compiled code convention, marshals them to the native 2008 // convention (handlizes oops, etc), transitions to native, makes the call, 2009 // returns to java state (possibly blocking), unhandlizes any result and 2010 // returns. 2011 // 2012 // Critical native functions are a shorthand for the use of 2013 // GetPrimtiveArrayCritical and disallow the use of any other JNI 2014 // functions. The wrapper is expected to unpack the arguments before 2015 // passing them to the callee. Critical native functions leave the state _in_Java, 2016 // since they cannot stop for GC. 2017 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 2018 // block and the check for pending exceptions it's impossible for them 2019 // to be thrown. 2020 // 2021 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 2022 const methodHandle& method, 2023 int compile_id, 2024 BasicType* in_sig_bt, 2025 VMRegPair* in_regs, 2026 BasicType ret_type) { 2027 if (method->is_continuation_native_intrinsic()) { 2028 int exception_offset = -1; 2029 OopMapSet* oop_maps = new OopMapSet(); 2030 int frame_complete = -1; 2031 int stack_slots = -1; 2032 int interpreted_entry_offset = -1; 2033 int vep_offset = -1; 2034 if (method->is_continuation_enter_intrinsic()) { 2035 gen_continuation_enter(masm, 2036 in_regs, 2037 exception_offset, 2038 oop_maps, 2039 frame_complete, 2040 stack_slots, 2041 interpreted_entry_offset, 2042 vep_offset); 2043 } else if (method->is_continuation_yield_intrinsic()) { 2044 gen_continuation_yield(masm, 2045 in_regs, 2046 oop_maps, 2047 frame_complete, 2048 stack_slots, 2049 vep_offset); 2050 } else { 2051 guarantee(false, "Unknown Continuation native intrinsic"); 2052 } 2053 2054 #ifdef ASSERT 2055 if (method->is_continuation_enter_intrinsic()) { 2056 assert(interpreted_entry_offset != -1, "Must be set"); 2057 assert(exception_offset != -1, "Must be set"); 2058 } else { 2059 assert(interpreted_entry_offset == -1, "Must be unset"); 2060 assert(exception_offset == -1, "Must be unset"); 2061 } 2062 assert(frame_complete != -1, "Must be set"); 2063 assert(stack_slots != -1, "Must be set"); 2064 assert(vep_offset != -1, "Must be set"); 2065 #endif 2066 2067 __ flush(); 2068 nmethod* nm = nmethod::new_native_nmethod(method, 2069 compile_id, 2070 masm->code(), 2071 vep_offset, 2072 frame_complete, 2073 stack_slots, 2074 in_ByteSize(-1), 2075 in_ByteSize(-1), 2076 oop_maps, 2077 exception_offset); 2078 if (nm == nullptr) return nm; 2079 if (method->is_continuation_enter_intrinsic()) { 2080 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 2081 } else if (method->is_continuation_yield_intrinsic()) { 2082 _cont_doYield_stub = nm; 2083 } 2084 return nm; 2085 } 2086 2087 if (method->is_method_handle_intrinsic()) { 2088 vmIntrinsics::ID iid = method->intrinsic_id(); 2089 intptr_t 
start = (intptr_t)__ pc(); 2090 int vep_offset = ((intptr_t)__ pc()) - start; 2091 gen_special_dispatch(masm, 2092 method, 2093 in_sig_bt, 2094 in_regs); 2095 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 2096 __ flush(); 2097 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 2098 return nmethod::new_native_nmethod(method, 2099 compile_id, 2100 masm->code(), 2101 vep_offset, 2102 frame_complete, 2103 stack_slots / VMRegImpl::slots_per_word, 2104 in_ByteSize(-1), 2105 in_ByteSize(-1), 2106 nullptr); 2107 } 2108 address native_func = method->native_function(); 2109 assert(native_func != nullptr, "must have function"); 2110 2111 // An OopMap for lock (and class if static) 2112 OopMapSet *oop_maps = new OopMapSet(); 2113 intptr_t start = (intptr_t)__ pc(); 2114 2115 // We have received a description of where all the java arg are located 2116 // on entry to the wrapper. We need to convert these args to where 2117 // the jni function will expect them. To figure out where they go 2118 // we convert the java signature to a C signature by inserting 2119 // the hidden arguments as arg[0] and possibly arg[1] (static method) 2120 2121 const int total_in_args = method->size_of_parameters(); 2122 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 2123 2124 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 2125 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 2126 2127 int argc = 0; 2128 out_sig_bt[argc++] = T_ADDRESS; 2129 if (method->is_static()) { 2130 out_sig_bt[argc++] = T_OBJECT; 2131 } 2132 2133 for (int i = 0; i < total_in_args ; i++ ) { 2134 out_sig_bt[argc++] = in_sig_bt[i]; 2135 } 2136 2137 // Now figure out where the args must be stored and how much stack space 2138 // they require. 2139 int out_arg_slots; 2140 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 2141 2142 // Compute framesize for the wrapper. We need to handlize all oops in 2143 // incoming registers 2144 2145 // Calculate the total number of stack slots we will need. 
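  // Rough worked example of the accounting below (illustrative numbers only,
  // assuming a SysV build where out_preserve_stack_slots() and out_arg_slots
  // are both 0, for a static synchronized native taking no Java arguments):
  //   6*2 (oop handle area) + 2 (klass) + 2 (lock) + 6 (moves/ra/rbp) = 22 slots,
  // and align_up(22, StackAlignmentInSlots) rounds that to 24 slots = 96 bytes.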
2146 2147 // First count the abi requirement plus all of the outgoing args 2148 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 2149 2150 // Now the space for the inbound oop handle area 2151 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 2152 2153 int oop_handle_offset = stack_slots; 2154 stack_slots += total_save_slots; 2155 2156 // Now any space we need for handlizing a klass if static method 2157 2158 int klass_slot_offset = 0; 2159 int klass_offset = -1; 2160 int lock_slot_offset = 0; 2161 bool is_static = false; 2162 2163 if (method->is_static()) { 2164 klass_slot_offset = stack_slots; 2165 stack_slots += VMRegImpl::slots_per_word; 2166 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 2167 is_static = true; 2168 } 2169 2170 // Plus a lock if needed 2171 2172 if (method->is_synchronized()) { 2173 lock_slot_offset = stack_slots; 2174 stack_slots += VMRegImpl::slots_per_word; 2175 } 2176 2177 // Now a place (+2) to save return values or temp during shuffling 2178 // + 4 for return address (which we own) and saved rbp 2179 stack_slots += 6; 2180 2181 // Ok The space we have allocated will look like: 2182 // 2183 // 2184 // FP-> | | 2185 // |---------------------| 2186 // | 2 slots for moves | 2187 // |---------------------| 2188 // | lock box (if sync) | 2189 // |---------------------| <- lock_slot_offset 2190 // | klass (if static) | 2191 // |---------------------| <- klass_slot_offset 2192 // | oopHandle area | 2193 // |---------------------| <- oop_handle_offset (6 java arg registers) 2194 // | outbound memory | 2195 // | based arguments | 2196 // | | 2197 // |---------------------| 2198 // | | 2199 // SP-> | out_preserved_slots | 2200 // 2201 // 2202 2203 2204 // Now compute actual number of stack words we need rounding to make 2205 // stack properly aligned. 2206 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 2207 2208 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 2209 2210 // First thing make an ic check to see if we should even be here 2211 2212 // We are free to use all registers as temps without saving them and 2213 // restoring them except rbp. rbp is the only callee save register 2214 // as far as the interpreter and the compiler(s) are concerned. 2215 2216 const Register receiver = j_rarg0; 2217 2218 Label exception_pending; 2219 2220 assert_different_registers(receiver, rscratch1, rscratch2); 2221 __ verify_oop(receiver); 2222 __ ic_check(8 /* end_alignment */); 2223 2224 int vep_offset = ((intptr_t)__ pc()) - start; 2225 2226 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2227 Label L_skip_barrier; 2228 Register klass = r10; 2229 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2230 __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 2231 2232 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2233 2234 __ bind(L_skip_barrier); 2235 } 2236 2237 #ifdef COMPILER1 2238 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
2239 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 2240 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 2241 } 2242 #endif // COMPILER1 2243 2244 // The instruction at the verified entry point must be 5 bytes or longer 2245 // because it can be patched on the fly by make_non_entrant. The stack bang 2246 // instruction fits that requirement. 2247 2248 // Generate stack overflow check 2249 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 2250 2251 // Generate a new frame for the wrapper. 2252 __ enter(); 2253 // -2 because return address is already present and so is saved rbp 2254 __ subptr(rsp, stack_size - 2*wordSize); 2255 2256 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2257 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 2258 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 2259 2260 // Frame is now completed as far as size and linkage. 2261 int frame_complete = ((intptr_t)__ pc()) - start; 2262 2263 #ifdef ASSERT 2264 __ check_stack_alignment(rsp, "improperly aligned stack"); 2265 #endif /* ASSERT */ 2266 2267 2268 // We use r14 as the oop handle for the receiver/klass 2269 // It is callee save so it survives the call to native 2270 2271 const Register oop_handle_reg = r14; 2272 2273 // 2274 // We immediately shuffle the arguments so that any vm call we have to 2275 // make from here on out (sync slow path, jvmti, etc.) we will have 2276 // captured the oops from our caller and have a valid oopMap for 2277 // them. 2278 2279 // ----------------- 2280 // The Grand Shuffle 2281 2282 // The Java calling convention is either equal (linux) or denser (win64) than the 2283 // c calling convention. However the because of the jni_env argument the c calling 2284 // convention always has at least one more (and two for static) arguments than Java. 2285 // Therefore if we move the args from java -> c backwards then we will never have 2286 // a register->register conflict and we don't have to build a dependency graph 2287 // and figure out how to break any cycles. 2288 // 2289 2290 // Record esp-based slot for receiver on stack for non-static methods 2291 int receiver_offset = -1; 2292 2293 // This is a trick. We double the stack slots so we can claim 2294 // the oops in the caller's frame. Since we are sure to have 2295 // more args than the caller doubling is enough to make 2296 // sure we can capture all the incoming oop args from the 2297 // caller. 2298 // 2299 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2300 2301 // Mark location of rbp (someday) 2302 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2303 2304 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2305 // All inbound args are referenced based on rbp and all outbound args via rsp. 2306 2307 2308 #ifdef ASSERT 2309 bool reg_destroyed[Register::number_of_registers]; 2310 bool freg_destroyed[XMMRegister::number_of_registers]; 2311 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2312 reg_destroyed[r] = false; 2313 } 2314 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2315 freg_destroyed[f] = false; 2316 } 2317 2318 #endif /* ASSERT */ 2319 2320 // For JNI natives the incoming and outgoing registers are offset upwards. 
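  // For illustration: with a static method taking two Java arguments,
  // total_in_args == 2 and total_c_args == 4 (JNIEnv* plus the class mirror),
  // so the loop below performs the moves in the order
  //   in[1] -> out[3], then in[0] -> out[2]
  // i.e. last argument first, which is what keeps the shuffle conflict-free.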
2321 GrowableArray<int> arg_order(2 * total_in_args); 2322 2323 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2324 arg_order.push(i); 2325 arg_order.push(c_arg); 2326 } 2327 2328 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2329 int i = arg_order.at(ai); 2330 int c_arg = arg_order.at(ai + 1); 2331 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2332 #ifdef ASSERT 2333 if (in_regs[i].first()->is_Register()) { 2334 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2335 } else if (in_regs[i].first()->is_XMMRegister()) { 2336 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2337 } 2338 if (out_regs[c_arg].first()->is_Register()) { 2339 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2340 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2341 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2342 } 2343 #endif /* ASSERT */ 2344 switch (in_sig_bt[i]) { 2345 case T_ARRAY: 2346 case T_OBJECT: 2347 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2348 ((i == 0) && (!is_static)), 2349 &receiver_offset); 2350 break; 2351 case T_VOID: 2352 break; 2353 2354 case T_FLOAT: 2355 __ float_move(in_regs[i], out_regs[c_arg]); 2356 break; 2357 2358 case T_DOUBLE: 2359 assert( i + 1 < total_in_args && 2360 in_sig_bt[i + 1] == T_VOID && 2361 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2362 __ double_move(in_regs[i], out_regs[c_arg]); 2363 break; 2364 2365 case T_LONG : 2366 __ long_move(in_regs[i], out_regs[c_arg]); 2367 break; 2368 2369 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2370 2371 default: 2372 __ move32_64(in_regs[i], out_regs[c_arg]); 2373 } 2374 } 2375 2376 int c_arg; 2377 2378 // Pre-load a static method's oop into r14. Used both by locking code and 2379 // the normal JNI call code. 2380 // point c_arg at the first arg that is already loaded in case we 2381 // need to spill before we call out 2382 c_arg = total_c_args - total_in_args; 2383 2384 if (method->is_static()) { 2385 2386 // load oop into a register 2387 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2388 2389 // Now handlize the static class mirror it's known not-null. 2390 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2391 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2392 2393 // Now get the handle 2394 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2395 // store the klass handle as second argument 2396 __ movptr(c_rarg1, oop_handle_reg); 2397 // and protect the arg if we must spill 2398 c_arg--; 2399 } 2400 2401 // Change state to native (we save the return address in the thread, since it might not 2402 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2403 // points into the right code segment. It does not have to be the correct return pc. 2404 // We use the same pc/oopMap repeatedly when we call out 2405 2406 Label native_return; 2407 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2408 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2409 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2410 } else { 2411 intptr_t the_pc = (intptr_t) __ pc(); 2412 oop_maps->add_gc_map(the_pc - start, map); 2413 2414 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2415 } 2416 2417 // We have all of the arguments setup at this point. 
We must not touch any register 2418 // argument registers at this point (what if we save/restore them there are no oop? 2419 2420 if (DTraceMethodProbes) { 2421 // protect the args we've loaded 2422 save_args(masm, total_c_args, c_arg, out_regs); 2423 __ mov_metadata(c_rarg1, method()); 2424 __ call_VM_leaf( 2425 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2426 r15_thread, c_rarg1); 2427 restore_args(masm, total_c_args, c_arg, out_regs); 2428 } 2429 2430 // RedefineClasses() tracing support for obsolete method entry 2431 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2432 // protect the args we've loaded 2433 save_args(masm, total_c_args, c_arg, out_regs); 2434 __ mov_metadata(c_rarg1, method()); 2435 __ call_VM_leaf( 2436 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2437 r15_thread, c_rarg1); 2438 restore_args(masm, total_c_args, c_arg, out_regs); 2439 } 2440 2441 // Lock a synchronized method 2442 2443 // Register definitions used by locking and unlocking 2444 2445 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2446 const Register obj_reg = rbx; // Will contain the oop 2447 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2448 const Register old_hdr = r13; // value of old header at unlock time 2449 2450 Label slow_path_lock; 2451 Label lock_done; 2452 2453 if (method->is_synchronized()) { 2454 Label count_mon; 2455 2456 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2457 2458 // Get the handle (the 2nd argument) 2459 __ mov(oop_handle_reg, c_rarg1); 2460 2461 // Get address of the box 2462 2463 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2464 2465 // Load the oop from the handle 2466 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2467 2468 if (LockingMode == LM_MONITOR) { 2469 __ jmp(slow_path_lock); 2470 } else if (LockingMode == LM_LEGACY) { 2471 // Load immediate 1 into swap_reg %rax 2472 __ movl(swap_reg, 1); 2473 2474 // Load (object->mark() | 1) into swap_reg %rax 2475 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2476 if (EnableValhalla) { 2477 // Mask inline_type bit such that we go to the slow path if object is an inline type 2478 __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place)); 2479 } 2480 2481 // Save (object->mark() | 1) into BasicLock's displaced header 2482 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2483 2484 // src -> dest iff dest == rax else rax <- dest 2485 __ lock(); 2486 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2487 __ jcc(Assembler::equal, count_mon); 2488 2489 // Hmm should this move to the slow path code area??? 2490 2491 // Test if the oopMark is an obvious stack pointer, i.e., 2492 // 1) (mark & 3) == 0, and 2493 // 2) rsp <= mark < mark + os::pagesize() 2494 // These 3 tests can be done by evaluating the following 2495 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2496 // assuming both stack pointer and pagesize have their 2497 // least significant 2 bits clear. 
2498 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2499 2500 __ subptr(swap_reg, rsp); 2501 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2502 2503 // Save the test result, for recursive case, the result is zero 2504 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2505 __ jcc(Assembler::notEqual, slow_path_lock); 2506 2507 __ bind(count_mon); 2508 __ inc_held_monitor_count(); 2509 } else { 2510 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2511 __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock); 2512 } 2513 2514 // Slow path will re-enter here 2515 __ bind(lock_done); 2516 } 2517 2518 // Finally just about ready to make the JNI call 2519 2520 // get JNIEnv* which is first argument to native 2521 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2522 2523 // Now set thread in native 2524 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2525 2526 __ call(RuntimeAddress(native_func)); 2527 2528 // Verify or restore cpu control state after JNI call 2529 __ restore_cpu_control_state_after_jni(rscratch1); 2530 2531 // Unpack native results. 2532 switch (ret_type) { 2533 case T_BOOLEAN: __ c2bool(rax); break; 2534 case T_CHAR : __ movzwl(rax, rax); break; 2535 case T_BYTE : __ sign_extend_byte (rax); break; 2536 case T_SHORT : __ sign_extend_short(rax); break; 2537 case T_INT : /* nothing to do */ break; 2538 case T_DOUBLE : 2539 case T_FLOAT : 2540 // Result is in xmm0 we'll save as needed 2541 break; 2542 case T_ARRAY: // Really a handle 2543 case T_OBJECT: // Really a handle 2544 break; // can't de-handlize until after safepoint check 2545 case T_VOID: break; 2546 case T_LONG: break; 2547 default : ShouldNotReachHere(); 2548 } 2549 2550 // Switch thread to "native transition" state before reading the synchronization state. 2551 // This additional state is necessary because reading and testing the synchronization 2552 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2553 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2554 // VM thread changes sync state to synchronizing and suspends threads for GC. 2555 // Thread A is resumed to finish this native method, but doesn't block here since it 2556 // didn't see any synchronization is progress, and escapes. 2557 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2558 2559 // Force this write out before the read below 2560 if (!UseSystemMemoryBarrier) { 2561 __ membar(Assembler::Membar_mask_bits( 2562 Assembler::LoadLoad | Assembler::LoadStore | 2563 Assembler::StoreLoad | Assembler::StoreStore)); 2564 } 2565 2566 // check for safepoint operation in progress and/or pending suspend requests 2567 { 2568 Label Continue; 2569 Label slow_path; 2570 2571 __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */); 2572 2573 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2574 __ jcc(Assembler::equal, Continue); 2575 __ bind(slow_path); 2576 2577 // Don't use call_VM as it will see a possible pending exception and forward it 2578 // and never return here preventing us from clearing _last_native_pc down below. 2579 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2580 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2581 // by hand. 
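  // (The sequence below is the hand-rolled leaf-call pattern used throughout
  //  this file: spill any live result, remember rsp in r12, reserve the Win64
  //  home space, then andptr(rsp, -16), which clears the low four bits of rsp
  //  -- e.g. 0x...7fe8 becomes 0x...7fe0 -- so the callee sees a 16-byte
  //  aligned stack.)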
2582 // 2583 __ vzeroupper(); 2584 save_native_result(masm, ret_type, stack_slots); 2585 __ mov(c_rarg0, r15_thread); 2586 __ mov(r12, rsp); // remember sp 2587 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2588 __ andptr(rsp, -16); // align stack as required by ABI 2589 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2590 __ mov(rsp, r12); // restore sp 2591 __ reinit_heapbase(); 2592 // Restore any method result value 2593 restore_native_result(masm, ret_type, stack_slots); 2594 __ bind(Continue); 2595 } 2596 2597 // change thread state 2598 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2599 2600 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2601 // Check preemption for Object.wait() 2602 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2603 __ cmpptr(rscratch1, NULL_WORD); 2604 __ jccb(Assembler::equal, native_return); 2605 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2606 __ jmp(rscratch1); 2607 __ bind(native_return); 2608 2609 intptr_t the_pc = (intptr_t) __ pc(); 2610 oop_maps->add_gc_map(the_pc - start, map); 2611 } 2612 2613 2614 Label reguard; 2615 Label reguard_done; 2616 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2617 __ jcc(Assembler::equal, reguard); 2618 __ bind(reguard_done); 2619 2620 // native result if any is live 2621 2622 // Unlock 2623 Label slow_path_unlock; 2624 Label unlock_done; 2625 if (method->is_synchronized()) { 2626 2627 Label fast_done; 2628 2629 // Get locked oop from the handle we passed to jni 2630 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2631 2632 if (LockingMode == LM_LEGACY) { 2633 Label not_recur; 2634 // Simple recursive lock? 
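  // (Recursion is detectable here because the locking code above stored the
  //  (mark - rsp) & (3 - os::vm_page_size()) test result as the displaced
  //  header, and that value is zero exactly in the recursive case.)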
2635 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2636 __ jcc(Assembler::notEqual, not_recur); 2637 __ dec_held_monitor_count(); 2638 __ jmpb(fast_done); 2639 __ bind(not_recur); 2640 } 2641 2642 // Must save rax if it is live now because cmpxchg must use it 2643 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2644 save_native_result(masm, ret_type, stack_slots); 2645 } 2646 2647 if (LockingMode == LM_MONITOR) { 2648 __ jmp(slow_path_unlock); 2649 } else if (LockingMode == LM_LEGACY) { 2650 // get address of the stack lock 2651 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2652 // get old displaced header 2653 __ movptr(old_hdr, Address(rax, 0)); 2654 2655 // Atomic swap old header if oop still contains the stack lock 2656 __ lock(); 2657 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2658 __ jcc(Assembler::notEqual, slow_path_unlock); 2659 __ dec_held_monitor_count(); 2660 } else { 2661 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2662 __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock); 2663 } 2664 2665 // slow path re-enters here 2666 __ bind(unlock_done); 2667 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2668 restore_native_result(masm, ret_type, stack_slots); 2669 } 2670 2671 __ bind(fast_done); 2672 } 2673 if (DTraceMethodProbes) { 2674 save_native_result(masm, ret_type, stack_slots); 2675 __ mov_metadata(c_rarg1, method()); 2676 __ call_VM_leaf( 2677 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2678 r15_thread, c_rarg1); 2679 restore_native_result(masm, ret_type, stack_slots); 2680 } 2681 2682 __ reset_last_Java_frame(false); 2683 2684 // Unbox oop result, e.g. JNIHandles::resolve value. 2685 if (is_reference_type(ret_type)) { 2686 __ resolve_jobject(rax /* value */, 2687 rcx /* tmp */); 2688 } 2689 2690 if (CheckJNICalls) { 2691 // clear_pending_jni_exception_check 2692 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2693 } 2694 2695 // reset handle block 2696 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2697 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2698 2699 // pop our frame 2700 2701 __ leave(); 2702 2703 #if INCLUDE_JFR 2704 // We need to do a poll test after unwind in case the sampler 2705 // managed to sample the native frame after returning to Java. 2706 Label L_return; 2707 address poll_test_pc = __ pc(); 2708 __ relocate(relocInfo::poll_return_type); 2709 __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 2710 __ jccb(Assembler::zero, L_return); 2711 __ lea(rscratch1, InternalAddress(poll_test_pc)); 2712 __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1); 2713 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr, 2714 "polling page return stub not created yet"); 2715 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); 2716 __ jump(RuntimeAddress(stub)); 2717 __ bind(L_return); 2718 #endif // INCLUDE_JFR 2719 2720 // Any exception pending? 
2721 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2722 __ jcc(Assembler::notEqual, exception_pending); 2723 2724 // Return 2725 2726 __ ret(0); 2727 2728 // Unexpected paths are out of line and go here 2729 2730 // forward the exception 2731 __ bind(exception_pending); 2732 2733 // and forward the exception 2734 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2735 2736 // Slow path locking & unlocking 2737 if (method->is_synchronized()) { 2738 2739 // BEGIN Slow path lock 2740 __ bind(slow_path_lock); 2741 2742 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2743 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2744 2745 // protect the args we've loaded 2746 save_args(masm, total_c_args, c_arg, out_regs); 2747 2748 __ mov(c_rarg0, obj_reg); 2749 __ mov(c_rarg1, lock_reg); 2750 __ mov(c_rarg2, r15_thread); 2751 2752 // Not a leaf but we have last_Java_frame setup as we want. 2753 // We don't want to unmount in case of contention since that would complicate preserving 2754 // the arguments that had already been marshalled into the native convention. So we force 2755 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2756 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 2757 __ push_cont_fastpath(); 2758 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2759 __ pop_cont_fastpath(); 2760 restore_args(masm, total_c_args, c_arg, out_regs); 2761 2762 #ifdef ASSERT 2763 { Label L; 2764 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2765 __ jcc(Assembler::equal, L); 2766 __ stop("no pending exception allowed on exit from monitorenter"); 2767 __ bind(L); 2768 } 2769 #endif 2770 __ jmp(lock_done); 2771 2772 // END Slow path lock 2773 2774 // BEGIN Slow path unlock 2775 __ bind(slow_path_unlock); 2776 2777 // If we haven't already saved the native result we must save it now as xmm registers 2778 // are still exposed. 
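  // (Integer and oop results were already spilled earlier in the unlock
  //  sequence -- see "Must save rax" above -- which is why only float/double
  //  results still need saving on this slow path.)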
2779 __ vzeroupper(); 2780 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2781 save_native_result(masm, ret_type, stack_slots); 2782 } 2783 2784 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2785 2786 __ mov(c_rarg0, obj_reg); 2787 __ mov(c_rarg2, r15_thread); 2788 __ mov(r12, rsp); // remember sp 2789 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2790 __ andptr(rsp, -16); // align stack as required by ABI 2791 2792 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2793 // NOTE that obj_reg == rbx currently 2794 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2795 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2796 2797 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2798 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2799 __ mov(rsp, r12); // restore sp 2800 __ reinit_heapbase(); 2801 #ifdef ASSERT 2802 { 2803 Label L; 2804 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2805 __ jcc(Assembler::equal, L); 2806 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2807 __ bind(L); 2808 } 2809 #endif /* ASSERT */ 2810 2811 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2812 2813 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2814 restore_native_result(masm, ret_type, stack_slots); 2815 } 2816 __ jmp(unlock_done); 2817 2818 // END Slow path unlock 2819 2820 } // synchronized 2821 2822 // SLOW PATH Reguard the stack if needed 2823 2824 __ bind(reguard); 2825 __ vzeroupper(); 2826 save_native_result(masm, ret_type, stack_slots); 2827 __ mov(r12, rsp); // remember sp 2828 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2829 __ andptr(rsp, -16); // align stack as required by ABI 2830 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2831 __ mov(rsp, r12); // restore sp 2832 __ reinit_heapbase(); 2833 restore_native_result(masm, ret_type, stack_slots); 2834 // and continue 2835 __ jmp(reguard_done); 2836 2837 2838 2839 __ flush(); 2840 2841 nmethod *nm = nmethod::new_native_nmethod(method, 2842 compile_id, 2843 masm->code(), 2844 vep_offset, 2845 frame_complete, 2846 stack_slots / VMRegImpl::slots_per_word, 2847 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2848 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2849 oop_maps); 2850 2851 return nm; 2852 } 2853 2854 // this function returns the adjust size (in number of words) to a c2i adapter 2855 // activation for use during deoptimization 2856 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2857 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2858 } 2859 2860 2861 uint SharedRuntime::out_preserve_stack_slots() { 2862 return 0; 2863 } 2864 2865 2866 // Number of stack slots between incoming argument block and the start of 2867 // a new frame. The PROLOG must add this many slots to the stack. The 2868 // EPILOG must remove this many slots. amd64 needs two slots for 2869 // return address. 
2870 uint SharedRuntime::in_preserve_stack_slots() { 2871 return 4 + 2 * VerifyStackAtCalls; 2872 } 2873 2874 VMReg SharedRuntime::thread_register() { 2875 return r15_thread->as_VMReg(); 2876 } 2877 2878 //------------------------------generate_deopt_blob---------------------------- 2879 void SharedRuntime::generate_deopt_blob() { 2880 // Allocate space for the code 2881 ResourceMark rm; 2882 // Setup code generation tools 2883 int pad = 0; 2884 if (UseAVX > 2) { 2885 pad += 1024; 2886 } 2887 if (UseAPX) { 2888 pad += 1024; 2889 } 2890 #if INCLUDE_JVMCI 2891 if (EnableJVMCI) { 2892 pad += 512; // Increase the buffer size when compiling for JVMCI 2893 } 2894 #endif 2895 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id); 2896 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name); 2897 if (blob != nullptr) { 2898 _deopt_blob = blob->as_deoptimization_blob(); 2899 return; 2900 } 2901 2902 CodeBuffer buffer(name, 2560+pad, 1024); 2903 MacroAssembler* masm = new MacroAssembler(&buffer); 2904 int frame_size_in_words; 2905 OopMap* map = nullptr; 2906 OopMapSet *oop_maps = new OopMapSet(); 2907 2908 // ------------- 2909 // This code enters when returning to a de-optimized nmethod. A return 2910 // address has been pushed on the stack, and return values are in 2911 // registers. 2912 // If we are doing a normal deopt then we were called from the patched 2913 // nmethod from the point we returned to the nmethod. So the return 2914 // address on the stack is wrong by NativeCall::instruction_size 2915 // We will adjust the value so it looks like we have the original return 2916 // address on the stack (like when we eagerly deoptimized). 2917 // In the case of an exception pending when deoptimizing, we enter 2918 // with a return address on the stack that points after the call we patched 2919 // into the exception handler. We have the following register state from, 2920 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2921 // rax: exception oop 2922 // rbx: exception handler 2923 // rdx: throwing pc 2924 // So in this case we simply jam rdx into the useless return address and 2925 // the stack looks just like we want. 2926 // 2927 // At this point we need to de-opt. We save the argument return 2928 // registers. We call the first C routine, fetch_unroll_info(). This 2929 // routine captures the return values and returns a structure which 2930 // describes the current frame size and the sizes of all replacement frames. 2931 // The current frame is compiled code and may contain many inlined 2932 // functions, each with their own JVM state. We pop the current frame, then 2933 // push all the new frames. Then we call the C routine unpack_frames() to 2934 // populate these frames. Finally unpack_frames() returns us the new target 2935 // address. Notice that callee-save registers are BLOWN here; they have 2936 // already been captured in the vframeArray at the time the return PC was 2937 // patched. 2938 address start = __ pc(); 2939 Label cont; 2940 2941 // Prolog for non exception case! 2942 2943 // Save everything in sight. 2944 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2945 2946 // Normal deoptimization. Save exec mode for unpack_frames. 
2947 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2948 __ jmp(cont); 2949 2950 int reexecute_offset = __ pc() - start; 2951 #if INCLUDE_JVMCI && !defined(COMPILER1) 2952 if (UseJVMCICompiler) { 2953 // JVMCI does not use this kind of deoptimization 2954 __ should_not_reach_here(); 2955 } 2956 #endif 2957 2958 // Reexecute case 2959 // return address is the pc describes what bci to do re-execute at 2960 2961 // No need to update map as each call to save_live_registers will produce identical oopmap 2962 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2963 2964 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2965 __ jmp(cont); 2966 2967 #if INCLUDE_JVMCI 2968 Label after_fetch_unroll_info_call; 2969 int implicit_exception_uncommon_trap_offset = 0; 2970 int uncommon_trap_offset = 0; 2971 2972 if (EnableJVMCI) { 2973 implicit_exception_uncommon_trap_offset = __ pc() - start; 2974 2975 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 2976 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 2977 2978 uncommon_trap_offset = __ pc() - start; 2979 2980 // Save everything in sight. 2981 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2982 // fetch_unroll_info needs to call last_java_frame() 2983 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 2984 2985 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 2986 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 2987 2988 __ movl(r14, Deoptimization::Unpack_reexecute); 2989 __ mov(c_rarg0, r15_thread); 2990 __ movl(c_rarg2, r14); // exec mode 2991 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 2992 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 2993 2994 __ reset_last_Java_frame(false); 2995 2996 __ jmp(after_fetch_unroll_info_call); 2997 } // EnableJVMCI 2998 #endif // INCLUDE_JVMCI 2999 3000 int exception_offset = __ pc() - start; 3001 3002 // Prolog for exception case 3003 3004 // all registers are dead at this entry point, except for rax, and 3005 // rdx which contain the exception oop and exception pc 3006 // respectively. Set them in TLS and fall thru to the 3007 // unpack_with_exception_in_tls entry point. 3008 3009 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 3010 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 3011 3012 int exception_in_tls_offset = __ pc() - start; 3013 3014 // new implementation because exception oop is now passed in JavaThread 3015 3016 // Prolog for exception case 3017 // All registers must be preserved because they might be used by LinearScan 3018 // Exceptiop oop and throwing PC are passed in JavaThread 3019 // tos: stack at point of call to method that threw the exception (i.e. only 3020 // args are on the stack, no return address) 3021 3022 // make room on stack for the return address 3023 // It will be patched later with the throwing pc. The correct value is not 3024 // available now because loading it from memory would destroy registers. 3025 __ push(0); 3026 3027 // Save everything in sight. 3028 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 3029 3030 // Now it is safe to overwrite any register 3031 3032 // Deopt during an exception. 
3033 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3034
3035 // load throwing pc from JavaThread and patch it as the return address
3036 // of the current frame. Then clear the field in JavaThread
3037
3038 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3039 __ movptr(Address(rbp, wordSize), rdx);
3040 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3041
3042 #ifdef ASSERT
3043 // verify that there is really an exception oop in JavaThread
3044 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3045 __ verify_oop(rax);
3046
3047 // verify that there is no pending exception
3048 Label no_pending_exception;
3049 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3050 __ testptr(rax, rax);
3051 __ jcc(Assembler::zero, no_pending_exception);
3052 __ stop("must not have pending exception here");
3053 __ bind(no_pending_exception);
3054 #endif
3055
3056 __ bind(cont);
3057
3058 // Call C code. Need thread and this frame, but NOT official VM entry
3059 // crud. We cannot block on this call, no GC can happen.
3060 //
3061 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3062
3063 // fetch_unroll_info needs to call last_java_frame().
3064
3065 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3066 #ifdef ASSERT
3067 { Label L;
3068 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3069 __ jcc(Assembler::equal, L);
3070 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3071 __ bind(L);
3072 }
3073 #endif // ASSERT
3074 __ mov(c_rarg0, r15_thread);
3075 __ movl(c_rarg1, r14); // exec_mode
3076 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3077
3078 // Need to have an oopmap that tells fetch_unroll_info where to
3079 // find any register it might need.
3080 oop_maps->add_gc_map(__ pc() - start, map);
3081
3082 __ reset_last_Java_frame(false);
3083
3084 #if INCLUDE_JVMCI
3085 if (EnableJVMCI) {
3086 __ bind(after_fetch_unroll_info_call);
3087 }
3088 #endif
3089
3090 // Load UnrollBlock* into rdi
3091 __ mov(rdi, rax);
3092
3093 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3094 Label noException;
3095 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
3096 __ jcc(Assembler::notEqual, noException);
3097 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3098 // QQQ this is useless; exception_pc was cleared to null above
3099 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3100 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3101 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3102
3103 __ verify_oop(rax);
3104
3105 // Overwrite the result registers with the exception results.
3106 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3107 // I think this is useless
3108 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3109
3110 __ bind(noException);
3111
3112 // Only register save data is on the stack.
3113 // Now restore the result registers. Everything else is either dead
3114 // or captured in the vframeArray.
3115 RegisterSaver::restore_result_registers(masm);
3116
3117 // All of the register save area has been popped off the stack. Only the
3118 // return address remains.
3119
3120 // Pop all the frames we must move/replace.
3121 // 3122 // Frame picture (youngest to oldest) 3123 // 1: self-frame (no frame link) 3124 // 2: deopting frame (no frame link) 3125 // 3: caller of deopting frame (could be compiled/interpreted). 3126 // 3127 // Note: by leaving the return address of self-frame on the stack 3128 // and using the size of frame 2 to adjust the stack 3129 // when we are done the return to frame 3 will still be on the stack. 3130 3131 // Pop deoptimized frame 3132 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset())); 3133 __ addptr(rsp, rcx); 3134 3135 // rsp should be pointing at the return address to the caller (3) 3136 3137 // Pick up the initial fp we should save 3138 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved) 3139 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset())); 3140 3141 #ifdef ASSERT 3142 // Compilers generate code that bang the stack by as much as the 3143 // interpreter would need. So this stack banging should never 3144 // trigger a fault. Verify that it does not on non product builds. 3145 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset())); 3146 __ bang_stack_size(rbx, rcx); 3147 #endif 3148 3149 // Load address of array of frame pcs into rcx 3150 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset())); 3151 3152 // Trash the old pc 3153 __ addptr(rsp, wordSize); 3154 3155 // Load address of array of frame sizes into rsi 3156 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset())); 3157 3158 // Load counter into rdx 3159 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); 3160 3161 // Now adjust the caller's stack to make up for the extra locals 3162 // but record the original sp so that we can save it in the skeletal interpreter 3163 // frame and the stack walking of interpreter_sender will get the unextended sp 3164 // value and not the "real" sp value. 3165 3166 const Register sender_sp = r8; 3167 3168 __ mov(sender_sp, rsp); 3169 __ movl(rbx, Address(rdi, 3170 Deoptimization::UnrollBlock:: 3171 caller_adjustment_offset())); 3172 __ subptr(rsp, rbx); 3173 3174 // Push interpreter frames in a loop 3175 Label loop; 3176 __ bind(loop); 3177 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3178 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 3179 __ pushptr(Address(rcx, 0)); // Save return address 3180 __ enter(); // Save old & set new ebp 3181 __ subptr(rsp, rbx); // Prolog 3182 // This value is corrected by layout_activation_impl 3183 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3184 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 3185 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3186 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3187 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3188 __ decrementl(rdx); // Decrement counter 3189 __ jcc(Assembler::notZero, loop); 3190 __ pushptr(Address(rcx, 0)); // Save final return address 3191 3192 // Re-push self-frame 3193 __ enter(); // Save old & set new ebp 3194 3195 // Allocate a full sized register save area. 3196 // Return address and rbp are in place, so we allocate two less words. 
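  // (The arithmetic behind the "two less words": frame_size_in_words, as
  //  reported by save_live_registers() above, counts the whole save frame
  //  including the return address slot and the saved rbp. The final return
  //  address was pushed at the end of the loop above and enter() has just
  //  pushed rbp, so only frame_size_in_words - 2 words remain to be allocated.)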
3197 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 3198 3199 // Restore frame locals after moving the frame 3200 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 3201 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3202 3203 // Call C code. Need thread but NOT official VM entry 3204 // crud. We cannot block on this call, no GC can happen. Call should 3205 // restore return values to their stack-slots with the new SP. 3206 // 3207 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 3208 3209 // Use rbp because the frames look interpreted now 3210 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3211 // Don't need the precise return PC here, just precise enough to point into this code blob. 3212 address the_pc = __ pc(); 3213 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3214 3215 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 3216 __ mov(c_rarg0, r15_thread); 3217 __ movl(c_rarg1, r14); // second arg: exec_mode 3218 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3219 // Revert SP alignment after call since we're going to do some SP relative addressing below 3220 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 3221 3222 // Set an oopmap for the call site 3223 // Use the same PC we used for the last java frame 3224 oop_maps->add_gc_map(the_pc - start, 3225 new OopMap( frame_size_in_words, 0 )); 3226 3227 // Clear fp AND pc 3228 __ reset_last_Java_frame(true); 3229 3230 // Collect return values 3231 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 3232 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 3233 // I think this is useless (throwing pc?) 3234 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 3235 3236 // Pop self-frame. 3237 __ leave(); // Epilog 3238 3239 // Jump to interpreter 3240 __ ret(0); 3241 3242 // Make sure all code is generated 3243 masm->flush(); 3244 3245 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 3246 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 3247 #if INCLUDE_JVMCI 3248 if (EnableJVMCI) { 3249 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 3250 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 3251 } 3252 #endif 3253 3254 AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name); 3255 } 3256 3257 //------------------------------generate_handler_blob------ 3258 // 3259 // Generate a special Compile2Runtime blob that saves all registers, 3260 // and setup oopmap. 3261 // 3262 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) { 3263 assert(StubRoutines::forward_exception_entry() != nullptr, 3264 "must be generated before"); 3265 assert(is_polling_page_id(id), "expected a polling page stub id"); 3266 3267 // Allocate space for the code. Setup code generation tools. 
3268 const char* name = SharedRuntime::stub_name(id); 3269 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name); 3270 if (blob != nullptr) { 3271 return blob->as_safepoint_blob(); 3272 } 3273 3274 ResourceMark rm; 3275 OopMapSet *oop_maps = new OopMapSet(); 3276 OopMap* map; 3277 CodeBuffer buffer(name, 2548, 1024); 3278 MacroAssembler* masm = new MacroAssembler(&buffer); 3279 3280 address start = __ pc(); 3281 address call_pc = nullptr; 3282 int frame_size_in_words; 3283 bool cause_return = (id == SharedStubId::polling_page_return_handler_id); 3284 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id); 3285 3286 // Make room for return address (or push it again) 3287 if (!cause_return) { 3288 __ push(rbx); 3289 } 3290 3291 // Save registers, fpu state, and flags 3292 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3293 3294 // The following is basically a call_VM. However, we need the precise 3295 // address of the call in order to generate an oopmap. Hence, we do all the 3296 // work ourselves. 3297 3298 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 3299 3300 // The return address must always be correct so that frame constructor never 3301 // sees an invalid pc. 3302 3303 if (!cause_return) { 3304 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3305 // Additionally, rbx is a callee saved register and we can look at it later to determine 3306 // if someone changed the return address for us! 3307 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3308 __ movptr(Address(rbp, wordSize), rbx); 3309 } 3310 3311 // Do the call 3312 __ mov(c_rarg0, r15_thread); 3313 __ call(RuntimeAddress(call_ptr)); 3314 3315 // Set an oopmap for the call site. This oopmap will map all 3316 // oop-registers and debug-info registers as callee-saved. This 3317 // will allow deoptimization at this safepoint to find all possible 3318 // debug-info recordings, as well as let GC find all oops. 3319 3320 oop_maps->add_gc_map( __ pc() - start, map); 3321 3322 Label noException; 3323 3324 __ reset_last_Java_frame(false); 3325 3326 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3327 __ jcc(Assembler::equal, noException); 3328 3329 // Exception pending 3330 3331 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3332 3333 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3334 3335 // No exception case 3336 __ bind(noException); 3337 3338 Label no_adjust; 3339 #ifdef ASSERT 3340 Label bail; 3341 #endif 3342 if (!cause_return) { 3343 Label no_prefix, not_special, check_rex_prefix; 3344 3345 // If our stashed return pc was modified by the runtime we avoid touching it 3346 __ cmpptr(rbx, Address(rbp, wordSize)); 3347 __ jcc(Assembler::notEqual, no_adjust); 3348 3349 // Skip over the poll instruction. 
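    // (A worked example of the skip logic below, using the 4-byte encoding
    //  "41 85 04 24" = test %eax,(%r12) from the table that follows: the REX
    //  prefix check advances rbx by 1, the r12/rsp ModRM special case adds 1
    //  more for the SIB byte, and the final addptr(rbx, 2) steps over the
    //  opcode and ModRM bytes, leaving the stashed return pc pointing at the
    //  instruction immediately after the poll.)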
3350 // See NativeInstruction::is_safepoint_poll()
3351 // Possible encodings:
3352 // 85 00 test %eax,(%rax)
3353 // 85 01 test %eax,(%rcx)
3354 // 85 02 test %eax,(%rdx)
3355 // 85 03 test %eax,(%rbx)
3356 // 85 06 test %eax,(%rsi)
3357 // 85 07 test %eax,(%rdi)
3358 //
3359 // 41 85 00 test %eax,(%r8)
3360 // 41 85 01 test %eax,(%r9)
3361 // 41 85 02 test %eax,(%r10)
3362 // 41 85 03 test %eax,(%r11)
3363 // 41 85 06 test %eax,(%r14)
3364 // 41 85 07 test %eax,(%r15)
3365 //
3366 // 85 04 24 test %eax,(%rsp)
3367 // 41 85 04 24 test %eax,(%r12)
3368 // 85 45 00 test %eax,0x0(%rbp)
3369 // 41 85 45 00 test %eax,0x0(%r13)
3370 //
3371 // Notes:
3372 // Format of the legacy MAP0 test instruction:
3373 // [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3374 // o For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the first
3375 // register operand and of the base register of the memory operand are both below 8, so no
3376 // REX prefix (whose REX.B bit would hold the MSB of the register encoding) is required,
3377 // which is why the two-byte encoding is sufficient here.
3378 // o For a polling instruction such as "test %eax,(%r8)", the encoding of the BASE register of
3379 // the memory operand is 1000b, so an additional REX prefix is required in this case,
3380 // thereby adding one byte to the instruction encoding.
3381 // o If the BASE register is one of the extended GPRs (r16-r31) available only on targets
3382 // supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the
3383 // most significant two bits of the 5-bit register encoding.
3384
3385 if (VM_Version::supports_apx_f()) {
3386 __ cmpb(Address(rbx, 0), Assembler::REX2);
3387 __ jccb(Assembler::notEqual, check_rex_prefix);
3388 __ addptr(rbx, 2);
3389 __ bind(check_rex_prefix);
3390 }
3391 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3392 __ jccb(Assembler::notEqual, no_prefix);
3393 __ addptr(rbx, 1);
3394 __ bind(no_prefix);
3395 #ifdef ASSERT
3396 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3397 #endif
3398 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3399 // r12/rsp 0x04
3400 // r13/rbp 0x05
3401 __ movzbq(rcx, Address(rbx, 1));
3402 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3403 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3404 __ cmpptr(rcx, 1);
3405 __ jccb(Assembler::above, not_special);
3406 __ addptr(rbx, 1);
3407 __ bind(not_special);
3408 #ifdef ASSERT
3409 // Verify the correct encoding of the poll we're about to skip.
3410 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3411 __ jcc(Assembler::notEqual, bail);
3412 // Mask out the modrm bits
3413 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3414 // rax encodes to 0, so if the bits are nonzero it's incorrect
3415 __ jcc(Assembler::notZero, bail);
3416 #endif
3417 // Adjust return pc forward to step over the safepoint poll instruction
3418 __ addptr(rbx, 2);
3419 __ movptr(Address(rbp, wordSize), rbx);
3420 }
3421
3422 __ bind(no_adjust);
3423 // Normal exit, restore registers and exit.
3424 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3425 __ ret(0);
3426
3427 #ifdef ASSERT
3428 __ bind(bail);
3429 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3430 #endif
3431
3432 // Make sure all code is generated
3433 masm->flush();
3434
3435 // Fill-out other meta info
3436 SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3437
3438 AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3439 return sp_blob;
3440 }
3441
3442 //
3443 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3444 //
3445 // Generate a stub that calls into the VM to find out the proper destination
3446 // of a Java call. All the argument registers are live at this point
3447 // but since this is generic code we don't know what they are and the caller
3448 // must do any GC of the args.
3449 //
3450 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3451 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3452 assert(is_resolve_id(id), "expected a resolve stub id");
3453
3454 const char* name = SharedRuntime::stub_name(id);
3455 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3456 if (blob != nullptr) {
3457 return blob->as_runtime_stub();
3458 }
3459
3460 // allocate space for the code
3461 ResourceMark rm;
3462 CodeBuffer buffer(name, 1552, 512);
3463 MacroAssembler* masm = new MacroAssembler(&buffer);
3464
3465 int frame_size_in_words;
3466
3467 OopMapSet *oop_maps = new OopMapSet();
3468 OopMap* map = nullptr;
3469
3470 int start = __ offset();
3471
3472 // No need to save vector registers since they are caller-saved anyway.
3473 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3474
3475 int frame_complete = __ offset();
3476
3477 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3478
3479 __ mov(c_rarg0, r15_thread);
3480
3481 __ call(RuntimeAddress(destination));
3482
3483
3484 // Set an oopmap for the call site.
3485 // We need this not only for callee-saved registers, but also for volatile
3486 // registers that the compiler might be keeping live across a safepoint.
3487
3488 oop_maps->add_gc_map( __ offset() - start, map);
3489
3490 // rax contains the address we are going to jump to, assuming no exception got installed
3491
3492 // clear last_Java_sp
3493 __ reset_last_Java_frame(false);
3494 // check for pending exceptions
3495 Label pending;
3496 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3497 __ jcc(Assembler::notEqual, pending);
3498
3499 // get the returned Method*
3500 __ get_vm_result_metadata(rbx);
3501 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3502
3503 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3504
3505 RegisterSaver::restore_live_registers(masm);
3506
3507 // We are back to the original state on entry and ready to go.
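  // (Recap of the contract at this point: the runtime call left the resolved
  //  entry address in rax and the callee Method* in rbx; both were written
  //  back into their RegisterSaver slots above, so restore_live_registers()
  //  has reloaded them together with the original argument registers and we
  //  can simply jump to the target.)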
3508 3509 __ jmp(rax); 3510 3511 // Pending exception after the safepoint 3512 3513 __ bind(pending); 3514 3515 RegisterSaver::restore_live_registers(masm); 3516 3517 // exception pending => remove activation and forward to exception handler 3518 3519 __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD); 3520 3521 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3522 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3523 3524 // ------------- 3525 // make sure all code is generated 3526 masm->flush(); 3527 3528 // return the blob 3529 // frame_size_words or bytes?? 3530 RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3531 3532 AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, (uint)id, name); 3533 return rs_blob; 3534 } 3535 3536 // Continuation point for throwing of implicit exceptions that are 3537 // not handled in the current activation. Fabricates an exception 3538 // oop and initiates normal exception dispatching in this 3539 // frame. Since we need to preserve callee-saved values (currently 3540 // only for C2, but done for C1 as well) we need a callee-saved oop 3541 // map and therefore have to make these stubs into RuntimeStubs 3542 // rather than BufferBlobs. If the compiler needs all registers to 3543 // be preserved between the fault point and the exception handler 3544 // then it must assume responsibility for that in 3545 // AbstractCompiler::continuation_for_implicit_null_exception or 3546 // continuation_for_implicit_division_by_zero_exception. All other 3547 // implicit exceptions (e.g., NullPointerException or 3548 // AbstractMethodError on entry) are either at call sites or 3549 // otherwise assume that stack unwinding will be initiated, so 3550 // caller saved registers were assumed volatile in the compiler. 3551 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) { 3552 assert(is_throw_id(id), "expected a throw stub id"); 3553 3554 const char* name = SharedRuntime::stub_name(id); 3555 3556 // Information about frame layout at time of blocking runtime call. 3557 // Note that we only have to preserve callee-saved registers since 3558 // the compilers are responsible for supplying a continuation point 3559 // if they expect all registers to be preserved. 
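  // (Sketch of the layout below, in 32-bit VMRegImpl slots: beyond any
  //  arg_reg_save_area_bytes the ABI requires, the frame holds the saved rbp
  //  (rbp_off/rbp_off2) and the return address (return_off/return_off2), so
  //  framesize comes out as a multiple of four slots, i.e. a multiple of
  //  16 bytes; the is_even(framesize/2) assert below relies on exactly that.)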
3560 enum layout { 3561 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3562 rbp_off2, 3563 return_off, 3564 return_off2, 3565 framesize // inclusive of return address 3566 }; 3567 3568 int insts_size = 512; 3569 int locs_size = 64; 3570 3571 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3572 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3573 3574 CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name); 3575 if (blob != nullptr) { 3576 return blob->as_runtime_stub(); 3577 } 3578 3579 ResourceMark rm; 3580 CodeBuffer code(name, insts_size, locs_size); 3581 OopMapSet* oop_maps = new OopMapSet(); 3582 MacroAssembler* masm = new MacroAssembler(&code); 3583 3584 address start = __ pc(); 3585 3586 // This is an inlined and slightly modified version of call_VM 3587 // which has the ability to fetch the return PC out of 3588 // thread-local storage and also sets up last_Java_sp slightly 3589 // differently than the real call_VM 3590 3591 __ enter(); // required for proper stackwalking of RuntimeStub frame 3592 3593 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3594 3595 // return address and rbp are already in place 3596 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3597 3598 int frame_complete = __ pc() - start; 3599 3600 // Set up last_Java_sp and last_Java_fp 3601 address the_pc = __ pc(); 3602 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3603 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3604 3605 // Call runtime 3606 __ movptr(c_rarg0, r15_thread); 3607 BLOCK_COMMENT("call runtime_entry"); 3608 __ call(RuntimeAddress(runtime_entry)); 3609 3610 // Generate oop map 3611 OopMap* map = new OopMap(framesize, 0); 3612 3613 oop_maps->add_gc_map(the_pc - start, map); 3614 3615 __ reset_last_Java_frame(true); 3616 3617 __ leave(); // required for proper stackwalking of RuntimeStub frame 3618 3619 // check for pending exceptions 3620 #ifdef ASSERT 3621 Label L; 3622 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3623 __ jcc(Assembler::notEqual, L); 3624 __ should_not_reach_here(); 3625 __ bind(L); 3626 #endif // ASSERT 3627 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3628 3629 3630 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3631 RuntimeStub* stub = 3632 RuntimeStub::new_runtime_stub(name, 3633 &code, 3634 frame_complete, 3635 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3636 oop_maps, false); 3637 AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, (uint)id, name); 3638 3639 return stub; 3640 } 3641 3642 //------------------------------Montgomery multiplication------------------------ 3643 // 3644 3645 #ifndef _WINDOWS 3646 3647 // Subtract 0:b from carry:a. Return carry. 3648 static julong 3649 sub(julong a[], julong b[], julong carry, long len) { 3650 long long i = 0, cnt = len; 3651 julong tmp; 3652 asm volatile("clc; " 3653 "0: ; " 3654 "mov (%[b], %[i], 8), %[tmp]; " 3655 "sbb %[tmp], (%[a], %[i], 8); " 3656 "inc %[i]; dec %[cnt]; " 3657 "jne 0b; " 3658 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3659 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3660 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3661 : "memory"); 3662 return tmp; 3663 } 3664 3665 // Multiply (unsigned) Long A by Long B, accumulating the double- 3666 // length result into the accumulator formed of T0, T1, and T2. 
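// A sketch of the invariant that both variants of these macros maintain (the
// inline-asm one below and the _WINDOWS intrinsics one further down): treating
// (T2:T1:T0) as a single little-endian, triple-word accumulator,
//
//   MACC(A, B, T0, T1, T2)  does (T2:T1:T0) += (julong)A * (julong)B
//   MACC2(A, B, T0, T1, T2) does (T2:T1:T0) += 2 * (julong)A * (julong)B
//
// where T2 only ever absorbs carries out of T1. That is what allows the
// Montgomery loops further down to retire one word per iteration with
// t0 = t1; t1 = t2; t2 = 0;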
3667 #define MACC(A, B, T0, T1, T2) \ 3668 do { \ 3669 unsigned long hi, lo; \ 3670 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3671 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3672 : "r"(A), "a"(B) : "cc"); \ 3673 } while(0) 3674 3675 // As above, but add twice the double-length result into the 3676 // accumulator. 3677 #define MACC2(A, B, T0, T1, T2) \ 3678 do { \ 3679 unsigned long hi, lo; \ 3680 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3681 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3682 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3683 : "r"(A), "a"(B) : "cc"); \ 3684 } while(0) 3685 3686 #else //_WINDOWS 3687 3688 static julong 3689 sub(julong a[], julong b[], julong carry, long len) { 3690 long i; 3691 julong tmp; 3692 unsigned char c = 1; 3693 for (i = 0; i < len; i++) { 3694 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3695 a[i] = tmp; 3696 } 3697 c = _addcarry_u64(c, carry, ~0, &tmp); 3698 return tmp; 3699 } 3700 3701 // Multiply (unsigned) Long A by Long B, accumulating the double- 3702 // length result into the accumulator formed of T0, T1, and T2. 3703 #define MACC(A, B, T0, T1, T2) \ 3704 do { \ 3705 julong hi, lo; \ 3706 lo = _umul128(A, B, &hi); \ 3707 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3708 c = _addcarry_u64(c, hi, T1, &T1); \ 3709 _addcarry_u64(c, T2, 0, &T2); \ 3710 } while(0) 3711 3712 // As above, but add twice the double-length result into the 3713 // accumulator. 3714 #define MACC2(A, B, T0, T1, T2) \ 3715 do { \ 3716 julong hi, lo; \ 3717 lo = _umul128(A, B, &hi); \ 3718 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3719 c = _addcarry_u64(c, hi, T1, &T1); \ 3720 _addcarry_u64(c, T2, 0, &T2); \ 3721 c = _addcarry_u64(0, lo, T0, &T0); \ 3722 c = _addcarry_u64(c, hi, T1, &T1); \ 3723 _addcarry_u64(c, T2, 0, &T2); \ 3724 } while(0) 3725 3726 #endif //_WINDOWS 3727 3728 // Fast Montgomery multiplication. The derivation of the algorithm is 3729 // in A Cryptographic Library for the Motorola DSP56000, 3730 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3731 3732 static void NOINLINE 3733 montgomery_multiply(julong a[], julong b[], julong n[], 3734 julong m[], julong inv, int len) { 3735 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3736 int i; 3737 3738 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3739 3740 for (i = 0; i < len; i++) { 3741 int j; 3742 for (j = 0; j < i; j++) { 3743 MACC(a[j], b[i-j], t0, t1, t2); 3744 MACC(m[j], n[i-j], t0, t1, t2); 3745 } 3746 MACC(a[i], b[0], t0, t1, t2); 3747 m[i] = t0 * inv; 3748 MACC(m[i], n[0], t0, t1, t2); 3749 3750 assert(t0 == 0, "broken Montgomery multiply"); 3751 3752 t0 = t1; t1 = t2; t2 = 0; 3753 } 3754 3755 for (i = len; i < 2*len; i++) { 3756 int j; 3757 for (j = i-len+1; j < len; j++) { 3758 MACC(a[j], b[i-j], t0, t1, t2); 3759 MACC(m[j], n[i-j], t0, t1, t2); 3760 } 3761 m[i-len] = t0; 3762 t0 = t1; t1 = t2; t2 = 0; 3763 } 3764 3765 while (t0) 3766 t0 = sub(m, n, t0, len); 3767 } 3768 3769 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3770 // multiplies so it should be up to 25% faster than Montgomery 3771 // multiplication. However, its loop control is more complex and it 3772 // may actually run slower on some machines. 
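// A brief sketch of where the saving comes from, matching the loop structure
// below: in the square, the off-diagonal products a[j]*a[i-j] and a[i-j]*a[j]
// are equal, so each pair is accumulated once with MACC2 instead of twice with
// MACC, and the diagonal term a[j]*a[j] is added once per even i. The
// reduction terms m[j]*n[i-j] still need one MACC each, which is why the
// overall saving approaches 25% rather than 50%.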
3773 3774 static void NOINLINE 3775 montgomery_square(julong a[], julong n[], 3776 julong m[], julong inv, int len) { 3777 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3778 int i; 3779 3780 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3781 3782 for (i = 0; i < len; i++) { 3783 int j; 3784 int end = (i+1)/2; 3785 for (j = 0; j < end; j++) { 3786 MACC2(a[j], a[i-j], t0, t1, t2); 3787 MACC(m[j], n[i-j], t0, t1, t2); 3788 } 3789 if ((i & 1) == 0) { 3790 MACC(a[j], a[j], t0, t1, t2); 3791 } 3792 for (; j < i; j++) { 3793 MACC(m[j], n[i-j], t0, t1, t2); 3794 } 3795 m[i] = t0 * inv; 3796 MACC(m[i], n[0], t0, t1, t2); 3797 3798 assert(t0 == 0, "broken Montgomery square"); 3799 3800 t0 = t1; t1 = t2; t2 = 0; 3801 } 3802 3803 for (i = len; i < 2*len; i++) { 3804 int start = i-len+1; 3805 int end = start + (len - start)/2; 3806 int j; 3807 for (j = start; j < end; j++) { 3808 MACC2(a[j], a[i-j], t0, t1, t2); 3809 MACC(m[j], n[i-j], t0, t1, t2); 3810 } 3811 if ((i & 1) == 0) { 3812 MACC(a[j], a[j], t0, t1, t2); 3813 } 3814 for (; j < len; j++) { 3815 MACC(m[j], n[i-j], t0, t1, t2); 3816 } 3817 m[i-len] = t0; 3818 t0 = t1; t1 = t2; t2 = 0; 3819 } 3820 3821 while (t0) 3822 t0 = sub(m, n, t0, len); 3823 } 3824 3825 // Swap words in a longword. 3826 static julong swap(julong x) { 3827 return (x << 32) | (x >> 32); 3828 } 3829 3830 // Copy len longwords from s to d, word-swapping as we go. The 3831 // destination array is reversed. 3832 static void reverse_words(julong *s, julong *d, int len) { 3833 d += len; 3834 while(len-- > 0) { 3835 d--; 3836 *d = swap(*s); 3837 s++; 3838 } 3839 } 3840 3841 // The threshold at which squaring is advantageous was determined 3842 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 3843 #define MONTGOMERY_SQUARING_THRESHOLD 64 3844 3845 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3846 jint len, jlong inv, 3847 jint *m_ints) { 3848 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3849 int longwords = len/2; 3850 3851 // Make very sure we don't use so much space that the stack might 3852 // overflow. 512 jints corresponds to an 16384-bit integer and 3853 // will use here a total of 8k bytes of stack space. 3854 int divisor = sizeof(julong) * 4; 3855 guarantee(longwords <= 8192 / divisor, "must be"); 3856 int total_allocation = longwords * sizeof (julong) * 4; 3857 julong *scratch = (julong *)alloca(total_allocation); 3858 3859 // Local scratch arrays 3860 julong 3861 *a = scratch + 0 * longwords, 3862 *b = scratch + 1 * longwords, 3863 *n = scratch + 2 * longwords, 3864 *m = scratch + 3 * longwords; 3865 3866 reverse_words((julong *)a_ints, a, longwords); 3867 reverse_words((julong *)b_ints, b, longwords); 3868 reverse_words((julong *)n_ints, n, longwords); 3869 3870 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3871 3872 reverse_words(m, (julong *)m_ints, longwords); 3873 } 3874 3875 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3876 jint len, jlong inv, 3877 jint *m_ints) { 3878 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3879 int longwords = len/2; 3880 3881 // Make very sure we don't use so much space that the stack might 3882 // overflow. 512 jints corresponds to an 16384-bit integer and 3883 // will use here a total of 6k bytes of stack space. 
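  // (For example, len == 512 jints means longwords == 256, and the three
  //  scratch arrays below then occupy 256 * sizeof(julong) * 3 == 6144 bytes;
  //  the guarantee caps longwords at 8192 / 24 == 341, so the alloca can never
  //  exceed 8184 bytes.)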
3884 int divisor = sizeof(julong) * 3; 3885 guarantee(longwords <= (8192 / divisor), "must be"); 3886 int total_allocation = longwords * sizeof (julong) * 3; 3887 julong *scratch = (julong *)alloca(total_allocation); 3888 3889 // Local scratch arrays 3890 julong 3891 *a = scratch + 0 * longwords, 3892 *n = scratch + 1 * longwords, 3893 *m = scratch + 2 * longwords; 3894 3895 reverse_words((julong *)a_ints, a, longwords); 3896 reverse_words((julong *)n_ints, n, longwords); 3897 3898 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3899 ::montgomery_square(a, n, m, (julong)inv, longwords); 3900 } else { 3901 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3902 } 3903 3904 reverse_words(m, (julong *)m_ints, longwords); 3905 } 3906 3907 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) { 3908 BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K); 3909 CodeBuffer buffer(buf); 3910 short buffer_locs[20]; 3911 buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs, 3912 sizeof(buffer_locs)/sizeof(relocInfo)); 3913 3914 MacroAssembler* masm = new MacroAssembler(&buffer); 3915 3916 const Array<SigEntry>* sig_vk = vk->extended_sig(); 3917 const Array<VMRegPair>* regs = vk->return_regs(); 3918 3919 int pack_fields_jobject_off = __ offset(); 3920 // Resolve pre-allocated buffer from JNI handle. 3921 // We cannot do this in generate_call_stub() because it requires GC code to be initialized. 3922 __ movptr(rax, Address(r13, 0)); 3923 __ resolve_jobject(rax /* value */, 3924 r12 /* tmp */); 3925 __ movptr(Address(r13, 0), rax); 3926 3927 int pack_fields_off = __ offset(); 3928 3929 int j = 1; 3930 for (int i = 0; i < sig_vk->length(); i++) { 3931 BasicType bt = sig_vk->at(i)._bt; 3932 if (bt == T_METADATA) { 3933 continue; 3934 } 3935 if (bt == T_VOID) { 3936 if (sig_vk->at(i-1)._bt == T_LONG || 3937 sig_vk->at(i-1)._bt == T_DOUBLE) { 3938 j++; 3939 } 3940 continue; 3941 } 3942 int off = sig_vk->at(i)._offset; 3943 assert(off > 0, "offset in object should be positive"); 3944 VMRegPair pair = regs->at(j); 3945 VMReg r_1 = pair.first(); 3946 VMReg r_2 = pair.second(); 3947 Address to(rax, off); 3948 if (bt == T_FLOAT) { 3949 __ movflt(to, r_1->as_XMMRegister()); 3950 } else if (bt == T_DOUBLE) { 3951 __ movdbl(to, r_1->as_XMMRegister()); 3952 } else { 3953 Register val = r_1->as_Register(); 3954 assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1); 3955 if (is_reference_type(bt)) { 3956 __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 3957 } else { 3958 __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt)); 3959 } 3960 } 3961 j++; 3962 } 3963 assert(j == regs->length(), "missed a field?"); 3964 if (vk->has_nullable_atomic_layout()) { 3965 // Set the null marker 3966 __ movb(Address(rax, vk->null_marker_offset()), 1); 3967 } 3968 __ ret(0); 3969 3970 int unpack_fields_off = __ offset(); 3971 3972 Label skip; 3973 Label not_null; 3974 __ testptr(rax, rax); 3975 __ jcc(Assembler::notZero, not_null); 3976 3977 // Return value is null. Zero oop registers to make the GC happy. 
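  // (The loop below mirrors the packing loop above: it walks the extended
  //  signature and, for each T_OBJECT/T_ARRAY entry, zeroes the register named
  //  by the return convention, since a caller's oop map may describe that
  //  register as an oop and stale bits there could otherwise be visited by GC.)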
3978 j = 1; 3979 for (int i = 0; i < sig_vk->length(); i++) { 3980 BasicType bt = sig_vk->at(i)._bt; 3981 if (bt == T_METADATA) { 3982 continue; 3983 } 3984 if (bt == T_VOID) { 3985 if (sig_vk->at(i-1)._bt == T_LONG || 3986 sig_vk->at(i-1)._bt == T_DOUBLE) { 3987 j++; 3988 } 3989 continue; 3990 } 3991 if (bt == T_OBJECT || bt == T_ARRAY) { 3992 VMRegPair pair = regs->at(j); 3993 VMReg r_1 = pair.first(); 3994 __ xorq(r_1->as_Register(), r_1->as_Register()); 3995 } 3996 j++; 3997 } 3998 __ jmp(skip); 3999 __ bind(not_null); 4000 4001 j = 1; 4002 for (int i = 0; i < sig_vk->length(); i++) { 4003 BasicType bt = sig_vk->at(i)._bt; 4004 if (bt == T_METADATA) { 4005 continue; 4006 } 4007 if (bt == T_VOID) { 4008 if (sig_vk->at(i-1)._bt == T_LONG || 4009 sig_vk->at(i-1)._bt == T_DOUBLE) { 4010 j++; 4011 } 4012 continue; 4013 } 4014 int off = sig_vk->at(i)._offset; 4015 assert(off > 0, "offset in object should be positive"); 4016 VMRegPair pair = regs->at(j); 4017 VMReg r_1 = pair.first(); 4018 VMReg r_2 = pair.second(); 4019 Address from(rax, off); 4020 if (bt == T_FLOAT) { 4021 __ movflt(r_1->as_XMMRegister(), from); 4022 } else if (bt == T_DOUBLE) { 4023 __ movdbl(r_1->as_XMMRegister(), from); 4024 } else if (bt == T_OBJECT || bt == T_ARRAY) { 4025 assert_different_registers(rax, r_1->as_Register()); 4026 __ load_heap_oop(r_1->as_Register(), from); 4027 } else { 4028 assert(is_java_primitive(bt), "unexpected basic type"); 4029 assert_different_registers(rax, r_1->as_Register()); 4030 size_t size_in_bytes = type2aelembytes(bt); 4031 __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN); 4032 } 4033 j++; 4034 } 4035 assert(j == regs->length(), "missed a field?"); 4036 4037 __ bind(skip); 4038 __ ret(0); 4039 4040 __ flush(); 4041 4042 return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off); 4043 } 4044 4045 #if INCLUDE_JFR 4046 4047 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 4048 // It returns a jobject handle to the event writer. 4049 // The handle is dereferenced and the return value is the event writer oop. 4050 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 4051 enum layout { 4052 rbp_off, 4053 rbpH_off, 4054 return_off, 4055 return_off2, 4056 framesize // inclusive of return address 4057 }; 4058 4059 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id); 4060 CodeBuffer code(name, 1024, 64); 4061 MacroAssembler* masm = new MacroAssembler(&code); 4062 address start = __ pc(); 4063 4064 __ enter(); 4065 address the_pc = __ pc(); 4066 4067 int frame_complete = the_pc - start; 4068 4069 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 4070 __ movptr(c_rarg0, r15_thread); 4071 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 4072 __ reset_last_Java_frame(true); 4073 4074 // rax is jobject handle result, unpack and process it through a barrier. 4075 __ resolve_global_jobject(rax, c_rarg0); 4076 4077 __ leave(); 4078 __ ret(0); 4079 4080 OopMapSet* oop_maps = new OopMapSet(); 4081 OopMap* map = new OopMap(framesize, 1); 4082 oop_maps->add_gc_map(frame_complete, map); 4083 4084 RuntimeStub* stub = 4085 RuntimeStub::new_runtime_stub(name, 4086 &code, 4087 frame_complete, 4088 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4089 oop_maps, 4090 false); 4091 return stub; 4092 } 4093 4094 // For c2: call to return a leased buffer. 
4095 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 4096 enum layout { 4097 rbp_off, 4098 rbpH_off, 4099 return_off, 4100 return_off2, 4101 framesize // inclusive of return address 4102 }; 4103 4104 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id); 4105 CodeBuffer code(name, 1024, 64); 4106 MacroAssembler* masm = new MacroAssembler(&code); 4107 address start = __ pc(); 4108 4109 __ enter(); 4110 address the_pc = __ pc(); 4111 4112 int frame_complete = the_pc - start; 4113 4114 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 4115 __ movptr(c_rarg0, r15_thread); 4116 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 4117 __ reset_last_Java_frame(true); 4118 4119 __ leave(); 4120 __ ret(0); 4121 4122 OopMapSet* oop_maps = new OopMapSet(); 4123 OopMap* map = new OopMap(framesize, 1); 4124 oop_maps->add_gc_map(frame_complete, map); 4125 4126 RuntimeStub* stub = 4127 RuntimeStub::new_runtime_stub(name, 4128 &code, 4129 frame_complete, 4130 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4131 oop_maps, 4132 false); 4133 return stub; 4134 } 4135 4136 #endif // INCLUDE_JFR