1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef _WINDOWS 26 #include "alloca.h" 27 #endif 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "classfile/symbolTable.hpp" 31 #include "code/compiledIC.hpp" 32 #include "code/debugInfoRec.hpp" 33 #include "code/nativeInst.hpp" 34 #include "code/vtableStubs.hpp" 35 #include "compiler/oopMap.hpp" 36 #include "gc/shared/collectedHeap.hpp" 37 #include "gc/shared/gcLocker.hpp" 38 #include "gc/shared/barrierSet.hpp" 39 #include "gc/shared/barrierSetAssembler.hpp" 40 #include "interpreter/interpreter.hpp" 41 #include "logging/log.hpp" 42 #include "memory/resourceArea.hpp" 43 #include "memory/universe.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "oops/method.inline.hpp" 46 #include "prims/methodHandles.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/globals.hpp" 50 #include "runtime/jniHandles.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/timerTrace.hpp" 56 #include "runtime/vframeArray.hpp" 57 #include "runtime/vm_version.hpp" 58 #include "utilities/align.hpp" 59 #include "utilities/checkedCast.hpp" 60 #include "utilities/formatBuffer.hpp" 61 #include "vmreg_x86.inline.hpp" 62 #ifdef COMPILER1 63 #include "c1/c1_Runtime1.hpp" 64 #endif 65 #ifdef COMPILER2 66 #include "opto/runtime.hpp" 67 #endif 68 #if INCLUDE_JVMCI 69 #include "jvmci/jvmciJavaClasses.hpp" 70 #endif 71 72 #define __ masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif // PRODUCT 79 80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; 81 82 class RegisterSaver { 83 // Capture info about frame layout. Layout offsets are in jint 84 // units because compiler frame slots are jints. 
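  // For illustration: with BytesPerInt == 4, each 64-bit register occupies two
  // consecutive jint slots, which is why every entry below comes as an
  // x_off/xH_off pair. The byte offset of a saved register is then simply
  // BytesPerInt * x_off (see e.g. rax_offset_in_bytes() further down).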
85 #define XSAVE_AREA_BEGIN 160 86 #define XSAVE_AREA_YMM_BEGIN 576 87 #define XSAVE_AREA_EGPRS 960 88 #define XSAVE_AREA_OPMASK_BEGIN 1088 89 #define XSAVE_AREA_ZMM_BEGIN 1152 90 #define XSAVE_AREA_UPPERBANK 1664 91 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off 92 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off 93 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off 94 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off 95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off 96 enum layout { 97 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area 98 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area 99 DEF_XMM_OFFS(0), 100 DEF_XMM_OFFS(1), 101 // 2..15 are implied in range usage 102 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 103 DEF_YMM_OFFS(0), 104 DEF_YMM_OFFS(1), 105 r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt, 106 r16H_off, 107 r17_off, r17H_off, 108 r18_off, r18H_off, 109 r19_off, r19H_off, 110 r20_off, r20H_off, 111 r21_off, r21H_off, 112 r22_off, r22H_off, 113 r23_off, r23H_off, 114 r24_off, r24H_off, 115 r25_off, r25H_off, 116 r26_off, r26H_off, 117 r27_off, r27H_off, 118 r28_off, r28H_off, 119 r29_off, r29H_off, 120 r30_off, r30H_off, 121 r31_off, r31H_off, 122 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 123 DEF_OPMASK_OFFS(0), 124 DEF_OPMASK_OFFS(1), 125 // 2..7 are implied in range usage 126 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, 127 DEF_ZMM_OFFS(0), 128 DEF_ZMM_OFFS(1), 129 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt, 130 DEF_ZMM_UPPER_OFFS(16), 131 DEF_ZMM_UPPER_OFFS(17), 132 // 18..31 are implied in range usage 133 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), 134 fpu_stateH_end, 135 r15_off, r15H_off, 136 r14_off, r14H_off, 137 r13_off, r13H_off, 138 r12_off, r12H_off, 139 r11_off, r11H_off, 140 r10_off, r10H_off, 141 r9_off, r9H_off, 142 r8_off, r8H_off, 143 rdi_off, rdiH_off, 144 rsi_off, rsiH_off, 145 ignore_off, ignoreH_off, // extra copy of rbp 146 rsp_off, rspH_off, 147 rbx_off, rbxH_off, 148 rdx_off, rdxH_off, 149 rcx_off, rcxH_off, 150 rax_off, raxH_off, 151 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state 152 align_off, alignH_off, 153 flags_off, flagsH_off, 154 // The frame sender code expects that rbp will be in the "natural" place and 155 // will override any oopMap setting for it. We must therefore force the layout 156 // so that it agrees with the frame sender code. 
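    // In other words, the top of this save area mirrors an ordinary "enter"
    // frame: the return address sits in the highest slots with the caller's
    // rbp directly below it, which is exactly what the frame sender code walks.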
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored;
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both the vector and non-vector layouts are allocated this way.
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jints), not bytes or words.
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, FPU state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
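
  // Rough sketch of the frame built below (illustrative; highest address first):
  //   return address             (pushed by the caller)
  //   saved rbp                  (enter())
  //   flags                      (pushf())
  //   8-byte alignment filler    (subq(rsp, 8))
  //   legacy GPRs rax..r15       (save_legacy_gprs())
  //   FXSAVE/XSAVE area          (push_FPU_state(), plus the explicit
  //                               YMM/ZMM/opmask/eGPR stores further down)
  //   arg_reg_save_area          (only if frame::arg_reg_save_area_bytes != 0)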
207 208 __ enter(); // rsp becomes 16-byte aligned here 209 __ pushf(); 210 // Make sure rsp stays 16-byte aligned 211 __ subq(rsp, 8); 212 // Push CPU state in multiple of 16 bytes 213 __ save_legacy_gprs(); 214 __ push_FPU_state(); 215 216 217 // push cpu state handles this on EVEX enabled targets 218 if (save_wide_vectors) { 219 // Save upper half of YMM registers(0..15) 220 int base_addr = XSAVE_AREA_YMM_BEGIN; 221 for (int n = 0; n < 16; n++) { 222 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n)); 223 } 224 if (VM_Version::supports_evex()) { 225 // Save upper half of ZMM registers(0..15) 226 base_addr = XSAVE_AREA_ZMM_BEGIN; 227 for (int n = 0; n < 16; n++) { 228 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n)); 229 } 230 // Save full ZMM registers(16..num_xmm_regs) 231 base_addr = XSAVE_AREA_UPPERBANK; 232 off = 0; 233 int vector_len = Assembler::AVX_512bit; 234 for (int n = 16; n < num_xmm_regs; n++) { 235 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 236 } 237 #if COMPILER2_OR_JVMCI 238 base_addr = XSAVE_AREA_OPMASK_BEGIN; 239 off = 0; 240 for(int n = 0; n < KRegister::number_of_registers; n++) { 241 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 242 } 243 #endif 244 } 245 } else { 246 if (VM_Version::supports_evex()) { 247 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 248 int base_addr = XSAVE_AREA_UPPERBANK; 249 off = 0; 250 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 251 for (int n = 16; n < num_xmm_regs; n++) { 252 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); 253 } 254 #if COMPILER2_OR_JVMCI 255 base_addr = XSAVE_AREA_OPMASK_BEGIN; 256 off = 0; 257 for(int n = 0; n < KRegister::number_of_registers; n++) { 258 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); 259 } 260 #endif 261 } 262 } 263 264 #if COMPILER2_OR_JVMCI 265 if (UseAPX) { 266 int base_addr = XSAVE_AREA_EGPRS; 267 off = 0; 268 for (int n = 16; n < Register::number_of_registers; n++) { 269 __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n)); 270 } 271 } 272 #endif 273 274 __ vzeroupper(); 275 if (frame::arg_reg_save_area_bytes != 0) { 276 // Allocate argument register save area 277 __ subptr(rsp, frame::arg_reg_save_area_bytes); 278 } 279 280 // Set an oopmap for the call site. This oopmap will map all 281 // oop-registers and debug-info registers as callee-saved. This 282 // will allow deoptimization at this safepoint to find all possible 283 // debug-info recordings, as well as let GC find all oops. 
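
  // The OopMap records locations in VMReg stack-slot units: STACK_OFFSET(x)
  // below simply wraps VMRegImpl::stack2reg(x), so an entry such as
  // set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg()) says, roughly,
  // "the value of rax lives rax_off 4-byte slots above rsp in this frame".
  // The matching *H_off entries describe the upper halves of the same
  // 64-bit registers.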
284 285 OopMapSet *oop_maps = new OopMapSet(); 286 OopMap* map = new OopMap(frame_size_in_slots, 0); 287 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x)) 289 290 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); 291 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); 292 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); 293 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); 294 // rbp location is known implicitly by the frame sender code, needs no oopmap 295 // and the location where rbp was saved by is ignored 296 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); 297 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); 298 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); 299 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); 300 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); 301 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); 302 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); 303 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); 304 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); 305 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); 306 307 if (UseAPX) { 308 map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg()); 309 map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg()); 310 map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg()); 311 map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg()); 312 map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg()); 313 map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg()); 314 map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg()); 315 map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg()); 316 map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg()); 317 map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg()); 318 map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg()); 319 map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg()); 320 map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg()); 321 map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg()); 322 map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg()); 323 map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg()); 324 } 325 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 326 // on EVEX enabled targets, we get it included in the xsave area 327 off = xmm0_off; 328 int delta = xmm1_off - off; 329 for (int n = 0; n < 16; n++) { 330 XMMRegister xmm_name = as_XMMRegister(n); 331 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); 332 off += delta; 333 } 334 if (UseAVX > 2) { 335 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 336 off = zmm16_off; 337 delta = zmm17_off - off; 338 for (int n = 16; n < num_xmm_regs; n++) { 339 XMMRegister zmm_name = as_XMMRegister(n); 340 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()); 341 off += delta; 342 } 343 } 344 345 #if COMPILER2_OR_JVMCI 346 if (save_wide_vectors) { 347 // Save upper half of YMM registers(0..15) 348 off = ymm0_off; 349 delta = ymm1_off - ymm0_off; 350 for (int n = 0; n < 16; n++) { 351 XMMRegister ymm_name = as_XMMRegister(n); 352 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4)); 353 off += delta; 354 } 355 if (VM_Version::supports_evex()) { 356 // Save upper half of ZMM registers(0..15) 357 off = zmm0_off; 
358 delta = zmm1_off - zmm0_off; 359 for (int n = 0; n < 16; n++) { 360 XMMRegister zmm_name = as_XMMRegister(n); 361 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8)); 362 off += delta; 363 } 364 } 365 } 366 #endif // COMPILER2_OR_JVMCI 367 368 // %%% These should all be a waste but we'll keep things as they were for now 369 if (true) { 370 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); 371 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); 372 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); 373 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); 374 // rbp location is known implicitly by the frame sender code, needs no oopmap 375 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); 376 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); 377 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); 378 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); 379 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); 380 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); 381 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); 382 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); 383 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); 384 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); 385 if (UseAPX) { 386 map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next()); 387 map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next()); 388 map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next()); 389 map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next()); 390 map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next()); 391 map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next()); 392 map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next()); 393 map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next()); 394 map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next()); 395 map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next()); 396 map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next()); 397 map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next()); 398 map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next()); 399 map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next()); 400 map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next()); 401 map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next()); 402 } 403 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, 404 // on EVEX enabled targets, we get it included in the xsave area 405 off = xmm0H_off; 406 delta = xmm1H_off - off; 407 for (int n = 0; n < 16; n++) { 408 XMMRegister xmm_name = as_XMMRegister(n); 409 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); 410 off += delta; 411 } 412 if (UseAVX > 2) { 413 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets 414 off = zmm16H_off; 415 delta = zmm17H_off - off; 416 for (int n = 16; n < num_xmm_regs; n++) { 417 XMMRegister zmm_name = as_XMMRegister(n); 418 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next()); 419 off += delta; 420 } 421 } 422 } 423 424 return map; 
425 } 426 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) { 428 int num_xmm_regs = XMMRegister::available_xmm_registers(); 429 if (frame::arg_reg_save_area_bytes != 0) { 430 // Pop arg register save area 431 __ addptr(rsp, frame::arg_reg_save_area_bytes); 432 } 433 434 #if COMPILER2_OR_JVMCI 435 if (restore_wide_vectors) { 436 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); 437 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); 438 } 439 #else 440 assert(!restore_wide_vectors, "vectors are generated only by C2"); 441 #endif 442 443 __ vzeroupper(); 444 445 // On EVEX enabled targets everything is handled in pop fpu state 446 if (restore_wide_vectors) { 447 // Restore upper half of YMM registers (0..15) 448 int base_addr = XSAVE_AREA_YMM_BEGIN; 449 for (int n = 0; n < 16; n++) { 450 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16)); 451 } 452 if (VM_Version::supports_evex()) { 453 // Restore upper half of ZMM registers (0..15) 454 base_addr = XSAVE_AREA_ZMM_BEGIN; 455 for (int n = 0; n < 16; n++) { 456 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32)); 457 } 458 // Restore full ZMM registers(16..num_xmm_regs) 459 base_addr = XSAVE_AREA_UPPERBANK; 460 int vector_len = Assembler::AVX_512bit; 461 int off = 0; 462 for (int n = 16; n < num_xmm_regs; n++) { 463 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 464 } 465 #if COMPILER2_OR_JVMCI 466 base_addr = XSAVE_AREA_OPMASK_BEGIN; 467 off = 0; 468 for (int n = 0; n < KRegister::number_of_registers; n++) { 469 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 470 } 471 #endif 472 } 473 } else { 474 if (VM_Version::supports_evex()) { 475 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage 476 int base_addr = XSAVE_AREA_UPPERBANK; 477 int off = 0; 478 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit; 479 for (int n = 16; n < num_xmm_regs; n++) { 480 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); 481 } 482 #if COMPILER2_OR_JVMCI 483 base_addr = XSAVE_AREA_OPMASK_BEGIN; 484 off = 0; 485 for (int n = 0; n < KRegister::number_of_registers; n++) { 486 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); 487 } 488 #endif 489 } 490 } 491 492 #if COMPILER2_OR_JVMCI 493 if (UseAPX) { 494 int base_addr = XSAVE_AREA_EGPRS; 495 int off = 0; 496 for (int n = 16; n < Register::number_of_registers; n++) { 497 __ movq(as_Register(n), Address(rsp, base_addr+(off++*8))); 498 } 499 } 500 #endif 501 502 // Recover CPU state 503 __ pop_FPU_state(); 504 __ restore_legacy_gprs(); 505 __ addq(rsp, 8); 506 __ popf(); 507 // Get the rbp described implicitly by the calling convention (no oopMap) 508 __ pop(rbp); 509 } 510 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) { 512 513 // Just restore result register. Only used by deoptimization. By 514 // now any callee save register that needs to be restored to a c2 515 // caller of the deoptee has been extracted into the vframeArray 516 // and will be stuffed into the c2i adapter we create for later 517 // restoration so only result registers need to be restored here. 
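
  // Concretely, the code below reloads xmm0 (floating-point results) and
  // rax/rdx (integer results) from the save area laid out by
  // save_live_registers(), and then pops everything below the saved return
  // address with a single addptr.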

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using the fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as frame sizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
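  // For example (illustrative), a signature (int, long, Object, double) maps to:
  //   int    -> j_rarg0   (set1, one 32-bit slot)
  //   long   -> j_rarg1   (set2, followed by a T_VOID half)
  //   Object -> j_rarg2   (set2)
  //   double -> j_farg0   (set2, followed by a T_VOID half)
  // Only once the six integer or eight float registers are exhausted do
  // arguments spill to 4-byte stack slots, which are aligned in pairs via
  // align_up() below.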
562 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { 563 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5 564 }; 565 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = { 566 j_farg0, j_farg1, j_farg2, j_farg3, 567 j_farg4, j_farg5, j_farg6, j_farg7 568 }; 569 570 571 uint int_args = 0; 572 uint fp_args = 0; 573 uint stk_args = 0; 574 575 for (int i = 0; i < total_args_passed; i++) { 576 switch (sig_bt[i]) { 577 case T_BOOLEAN: 578 case T_CHAR: 579 case T_BYTE: 580 case T_SHORT: 581 case T_INT: 582 if (int_args < Argument::n_int_register_parameters_j) { 583 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 584 } else { 585 stk_args = align_up(stk_args, 2); 586 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 587 stk_args += 1; 588 } 589 break; 590 case T_VOID: 591 // halves of T_LONG or T_DOUBLE 592 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 593 regs[i].set_bad(); 594 break; 595 case T_LONG: 596 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 597 // fall through 598 case T_OBJECT: 599 case T_ARRAY: 600 case T_ADDRESS: 601 if (int_args < Argument::n_int_register_parameters_j) { 602 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 603 } else { 604 stk_args = align_up(stk_args, 2); 605 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 606 stk_args += 2; 607 } 608 break; 609 case T_FLOAT: 610 if (fp_args < Argument::n_float_register_parameters_j) { 611 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 612 } else { 613 stk_args = align_up(stk_args, 2); 614 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 615 stk_args += 1; 616 } 617 break; 618 case T_DOUBLE: 619 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 620 if (fp_args < Argument::n_float_register_parameters_j) { 621 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 622 } else { 623 stk_args = align_up(stk_args, 2); 624 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 625 stk_args += 2; 626 } 627 break; 628 default: 629 ShouldNotReachHere(); 630 break; 631 } 632 } 633 634 return stk_args; 635 } 636 637 // Same as java_calling_convention() but for multiple return 638 // values. There's no way to store them on the stack so if we don't 639 // have enough registers, multiple values can't be returned. 640 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1; 641 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j; 642 int SharedRuntime::java_return_convention(const BasicType *sig_bt, 643 VMRegPair *regs, 644 int total_args_passed) { 645 // Create the mapping between argument positions and 646 // registers. 
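  // Note the register order below: rax carries the first (or only) integer
  // return value, and any additional integer values take the j_rarg registers
  // in reverse order. If the values do not all fit in registers the function
  // simply returns -1; multiple return values are never passed on the stack.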
647 static const Register INT_ArgReg[java_return_convention_max_int] = { 648 rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0 649 }; 650 static const XMMRegister FP_ArgReg[java_return_convention_max_float] = { 651 j_farg0, j_farg1, j_farg2, j_farg3, 652 j_farg4, j_farg5, j_farg6, j_farg7 653 }; 654 655 656 uint int_args = 0; 657 uint fp_args = 0; 658 659 for (int i = 0; i < total_args_passed; i++) { 660 switch (sig_bt[i]) { 661 case T_BOOLEAN: 662 case T_CHAR: 663 case T_BYTE: 664 case T_SHORT: 665 case T_INT: 666 if (int_args < Argument::n_int_register_parameters_j+1) { 667 regs[i].set1(INT_ArgReg[int_args]->as_VMReg()); 668 int_args++; 669 } else { 670 return -1; 671 } 672 break; 673 case T_VOID: 674 // halves of T_LONG or T_DOUBLE 675 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 676 regs[i].set_bad(); 677 break; 678 case T_LONG: 679 assert(sig_bt[i + 1] == T_VOID, "expecting half"); 680 // fall through 681 case T_OBJECT: 682 case T_ARRAY: 683 case T_ADDRESS: 684 case T_METADATA: 685 if (int_args < Argument::n_int_register_parameters_j+1) { 686 regs[i].set2(INT_ArgReg[int_args]->as_VMReg()); 687 int_args++; 688 } else { 689 return -1; 690 } 691 break; 692 case T_FLOAT: 693 if (fp_args < Argument::n_float_register_parameters_j) { 694 regs[i].set1(FP_ArgReg[fp_args]->as_VMReg()); 695 fp_args++; 696 } else { 697 return -1; 698 } 699 break; 700 case T_DOUBLE: 701 assert(sig_bt[i + 1] == T_VOID, "expecting half"); 702 if (fp_args < Argument::n_float_register_parameters_j) { 703 regs[i].set2(FP_ArgReg[fp_args]->as_VMReg()); 704 fp_args++; 705 } else { 706 return -1; 707 } 708 break; 709 default: 710 ShouldNotReachHere(); 711 break; 712 } 713 } 714 715 return int_args + fp_args; 716 } 717 718 // Patch the callers callsite with entry to compiled code if it exists. 719 static void patch_callers_callsite(MacroAssembler *masm) { 720 Label L; 721 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD); 722 __ jcc(Assembler::equal, L); 723 724 // Save the current stack pointer 725 __ mov(r13, rsp); 726 // Schedule the branch target address early. 727 // Call into the VM to patch the caller, then jump to compiled callee 728 // rax isn't live so capture return address while we easily can 729 __ movptr(rax, Address(rsp, 0)); 730 731 // align stack so push_CPU_state doesn't fault 732 __ andptr(rsp, -(StackAlignmentInBytes)); 733 __ push_CPU_state(); 734 __ vzeroupper(); 735 // VM needs caller's callsite 736 // VM needs target method 737 // This needs to be a long call since we will relocate this adapter to 738 // the codeBuffer and it may not reach 739 740 // Allocate argument register save area 741 if (frame::arg_reg_save_area_bytes != 0) { 742 __ subptr(rsp, frame::arg_reg_save_area_bytes); 743 } 744 __ mov(c_rarg0, rbx); 745 __ mov(c_rarg1, rax); 746 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); 747 748 // De-allocate argument register save area 749 if (frame::arg_reg_save_area_bytes != 0) { 750 __ addptr(rsp, frame::arg_reg_save_area_bytes); 751 } 752 753 __ vzeroupper(); 754 __ pop_CPU_state(); 755 // restore sp 756 __ mov(rsp, r13); 757 __ bind(L); 758 } 759 760 // For each inline type argument, sig includes the list of fields of 761 // the inline type. This utility function computes the number of 762 // arguments for the call if inline types are passed by reference (the 763 // calling convention the interpreter expects). 
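// Worked example (matching the comment inside the function below): for the
// extended signature
//   T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID T_VOID T_VOID
// the whole sequence describes a single flattened inline type argument, so
// compute_total_args_passed_int() counts it as one interpreter argument.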
764 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) { 765 int total_args_passed = 0; 766 if (InlineTypePassFieldsAsArgs) { 767 for (int i = 0; i < sig_extended->length(); i++) { 768 BasicType bt = sig_extended->at(i)._bt; 769 if (bt == T_METADATA) { 770 // In sig_extended, an inline type argument starts with: 771 // T_METADATA, followed by the types of the fields of the 772 // inline type and T_VOID to mark the end of the value 773 // type. Inline types are flattened so, for instance, in the 774 // case of an inline type with an int field and an inline type 775 // field that itself has 2 fields, an int and a long: 776 // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second 777 // slot for the T_LONG) T_VOID (inner inline type) T_VOID 778 // (outer inline type) 779 total_args_passed++; 780 int vt = 1; 781 do { 782 i++; 783 BasicType bt = sig_extended->at(i)._bt; 784 BasicType prev_bt = sig_extended->at(i-1)._bt; 785 if (bt == T_METADATA) { 786 vt++; 787 } else if (bt == T_VOID && 788 prev_bt != T_LONG && 789 prev_bt != T_DOUBLE) { 790 vt--; 791 } 792 } while (vt != 0); 793 } else { 794 total_args_passed++; 795 } 796 } 797 } else { 798 total_args_passed = sig_extended->length(); 799 } 800 return total_args_passed; 801 } 802 803 804 static void gen_c2i_adapter_helper(MacroAssembler* masm, 805 BasicType bt, 806 BasicType prev_bt, 807 size_t size_in_bytes, 808 const VMRegPair& reg_pair, 809 const Address& to, 810 int extraspace, 811 bool is_oop) { 812 if (bt == T_VOID) { 813 assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half"); 814 return; 815 } 816 817 // Say 4 args: 818 // i st_off 819 // 0 32 T_LONG 820 // 1 24 T_VOID 821 // 2 16 T_OBJECT 822 // 3 8 T_BOOL 823 // - 0 return address 824 // 825 // However to make thing extra confusing. Because we can fit a long/double in 826 // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter 827 // leaves one slot empty and only stores to a single slot. In this case the 828 // slot that is occupied is the T_VOID slot. See I said it was confusing. 
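
  // Concretely, for the T_LONG in the example above the 64-bit value goes in
  // the lower-addressed slot of the pair (the one the trailing T_VOID entry
  // corresponds to), which is why gen_c2i_adapter below passes next_off rather
  // than st_off as the destination Address for T_LONG/T_DOUBLE.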
829 830 bool wide = (size_in_bytes == wordSize); 831 VMReg r_1 = reg_pair.first(); 832 VMReg r_2 = reg_pair.second(); 833 assert(r_2->is_valid() == wide, "invalid size"); 834 if (!r_1->is_valid()) { 835 assert(!r_2->is_valid(), "must be invalid"); 836 return; 837 } 838 839 if (!r_1->is_XMMRegister()) { 840 Register val = rax; 841 if (r_1->is_stack()) { 842 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 843 __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false); 844 } else { 845 val = r_1->as_Register(); 846 } 847 assert_different_registers(to.base(), val, rscratch1); 848 if (is_oop) { 849 __ push(r13); 850 __ push(rbx); 851 __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 852 __ pop(rbx); 853 __ pop(r13); 854 } else { 855 __ store_sized_value(to, val, size_in_bytes); 856 } 857 } else { 858 if (wide) { 859 __ movdbl(to, r_1->as_XMMRegister()); 860 } else { 861 __ movflt(to, r_1->as_XMMRegister()); 862 } 863 } 864 } 865 866 static void gen_c2i_adapter(MacroAssembler *masm, 867 const GrowableArray<SigEntry>* sig_extended, 868 const VMRegPair *regs, 869 bool requires_clinit_barrier, 870 address& c2i_no_clinit_check_entry, 871 Label& skip_fixup, 872 address start, 873 OopMapSet* oop_maps, 874 int& frame_complete, 875 int& frame_size_in_words, 876 bool alloc_inline_receiver) { 877 if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) { 878 Label L_skip_barrier; 879 Register method = rbx; 880 881 { // Bypass the barrier for non-static methods 882 Register flags = rscratch1; 883 __ load_unsigned_short(flags, Address(method, Method::access_flags_offset())); 884 __ testl(flags, JVM_ACC_STATIC); 885 __ jcc(Assembler::zero, L_skip_barrier); // non-static 886 } 887 888 Register klass = rscratch1; 889 __ load_method_holder(klass, method); 890 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 891 892 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 893 894 __ bind(L_skip_barrier); 895 c2i_no_clinit_check_entry = __ pc(); 896 } 897 898 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 899 bs->c2i_entry_barrier(masm); 900 901 // Before we get into the guts of the C2I adapter, see if we should be here 902 // at all. We've come from compiled code and are attempting to jump to the 903 // interpreter, which means the caller made a static call to get here 904 // (vcalls always get a compiled target if there is one). Check for a 905 // compiled target. If there is one, we need to patch the caller's call. 906 patch_callers_callsite(masm); 907 908 __ bind(skip_fixup); 909 910 if (InlineTypePassFieldsAsArgs) { 911 // Is there an inline type argument? 912 bool has_inline_argument = false; 913 for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) { 914 has_inline_argument = (sig_extended->at(i)._bt == T_METADATA); 915 } 916 if (has_inline_argument) { 917 // There is at least an inline type argument: we're coming from 918 // compiled code so we have no buffers to back the inline types. 919 // Allocate the buffers here with a runtime call. 
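      // Roughly: the call goes out to SharedRuntime::allocate_inline_types()
      // with the current thread in c_rarg0 and the Method* (rbx) in c_rarg1;
      // the live registers are saved around it, and the resulting array of
      // buffer oops comes back via the thread-local vm_result, which is picked
      // up below with get_vm_result().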
920 OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false); 921 922 frame_complete = __ offset(); 923 924 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 925 926 __ mov(c_rarg0, r15_thread); 927 __ mov(c_rarg1, rbx); 928 __ mov64(c_rarg2, (int64_t)alloc_inline_receiver); 929 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types))); 930 931 oop_maps->add_gc_map((int)(__ pc() - start), map); 932 __ reset_last_Java_frame(false); 933 934 RegisterSaver::restore_live_registers(masm); 935 936 Label no_exception; 937 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 938 __ jcc(Assembler::equal, no_exception); 939 940 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 941 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 942 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 943 944 __ bind(no_exception); 945 946 // We get an array of objects from the runtime call 947 __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr() 948 __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live? 949 } 950 } 951 952 // Since all args are passed on the stack, total_args_passed * 953 // Interpreter::stackElementSize is the space we need. 954 int total_args_passed = compute_total_args_passed_int(sig_extended); 955 assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed); 956 957 int extraspace = (total_args_passed * Interpreter::stackElementSize); 958 959 // stack is aligned, keep it that way 960 // This is not currently needed or enforced by the interpreter, but 961 // we might as well conform to the ABI. 962 extraspace = align_up(extraspace, 2*wordSize); 963 964 // set senderSP value 965 __ lea(r13, Address(rsp, wordSize)); 966 967 #ifdef ASSERT 968 __ check_stack_alignment(r13, "sender stack not aligned"); 969 #endif 970 if (extraspace > 0) { 971 // Pop the return address 972 __ pop(rax); 973 974 __ subptr(rsp, extraspace); 975 976 // Push the return address 977 __ push(rax); 978 979 // Account for the return address location since we store it first rather 980 // than hold it in a register across all the shuffling 981 extraspace += wordSize; 982 } 983 984 #ifdef ASSERT 985 __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax); 986 #endif 987 988 // Now write the args into the outgoing interpreter space 989 990 // next_arg_comp is the next argument from the compiler point of 991 // view (inline type fields are passed in registers/on the stack). In 992 // sig_extended, an inline type argument starts with: T_METADATA, 993 // followed by the types of the fields of the inline type and T_VOID 994 // to mark the end of the inline type. ignored counts the number of 995 // T_METADATA/T_VOID. next_vt_arg is the next inline type argument: 996 // used to get the buffer for that argument from the pool of buffers 997 // we allocated above and want to pass to the 998 // interpreter. next_arg_int is the next argument from the 999 // interpreter point of view (inline types are passed by reference). 
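  // For instance, with one flattened inline type argument as in the example
  // above, the loop below walks every sig_extended entry (next_arg_comp), the
  // T_METADATA/T_VOID markers bump 'ignored', the buffer for the argument is
  // fetched using next_vt_arg, and the interpreter-side slot is addressed with
  // next_arg_int, which advances only once for the whole inline type.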
1000 for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0; 1001 next_arg_comp < sig_extended->length(); next_arg_comp++) { 1002 assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments"); 1003 assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?"); 1004 BasicType bt = sig_extended->at(next_arg_comp)._bt; 1005 int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize; 1006 if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) { 1007 int next_off = st_off - Interpreter::stackElementSize; 1008 const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off; 1009 const VMRegPair reg_pair = regs[next_arg_comp-ignored]; 1010 size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4; 1011 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL, 1012 size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false); 1013 next_arg_int++; 1014 #ifdef ASSERT 1015 if (bt == T_LONG || bt == T_DOUBLE) { 1016 // Overwrite the unused slot with known junk 1017 __ mov64(rax, CONST64(0xdeadffffdeadaaaa)); 1018 __ movptr(Address(rsp, st_off), rax); 1019 } 1020 #endif /* ASSERT */ 1021 } else { 1022 ignored++; 1023 // get the buffer from the just allocated pool of buffers 1024 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT); 1025 __ load_heap_oop(r14, Address(rscratch2, index)); 1026 next_vt_arg++; next_arg_int++; 1027 int vt = 1; 1028 // write fields we get from compiled code in registers/stack 1029 // slots to the buffer: we know we are done with that inline type 1030 // argument when we hit the T_VOID that acts as an end of inline 1031 // type delimiter for this inline type. Inline types are flattened 1032 // so we might encounter embedded inline types. Each entry in 1033 // sig_extended contains a field offset in the buffer. 1034 Label L_null; 1035 do { 1036 next_arg_comp++; 1037 BasicType bt = sig_extended->at(next_arg_comp)._bt; 1038 BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt; 1039 if (bt == T_METADATA) { 1040 vt++; 1041 ignored++; 1042 } else if (bt == T_VOID && 1043 prev_bt != T_LONG && 1044 prev_bt != T_DOUBLE) { 1045 vt--; 1046 ignored++; 1047 } else { 1048 int off = sig_extended->at(next_arg_comp)._offset; 1049 if (off == -1) { 1050 // Nullable inline type argument, emit null check 1051 VMReg reg = regs[next_arg_comp-ignored].first(); 1052 Label L_notNull; 1053 if (reg->is_stack()) { 1054 int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace; 1055 __ testb(Address(rsp, ld_off), 1); 1056 } else { 1057 __ testb(reg->as_Register(), 1); 1058 } 1059 __ jcc(Assembler::notZero, L_notNull); 1060 __ movptr(Address(rsp, st_off), 0); 1061 __ jmp(L_null); 1062 __ bind(L_notNull); 1063 continue; 1064 } 1065 assert(off > 0, "offset in object should be positive"); 1066 size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize; 1067 bool is_oop = is_reference_type(bt); 1068 gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL, 1069 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop); 1070 } 1071 } while (vt != 0); 1072 // pass the buffer to the interpreter 1073 __ movptr(Address(rsp, st_off), r14); 1074 __ bind(L_null); 1075 } 1076 } 1077 1078 // Schedule the branch target address early. 
1079 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset()))); 1080 __ jmp(rcx); 1081 } 1082 1083 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, 1084 address code_start, address code_end, 1085 Label& L_ok) { 1086 Label L_fail; 1087 __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none)); 1088 __ cmpptr(pc_reg, temp_reg); 1089 __ jcc(Assembler::belowEqual, L_fail); 1090 __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none)); 1091 __ cmpptr(pc_reg, temp_reg); 1092 __ jcc(Assembler::below, L_ok); 1093 __ bind(L_fail); 1094 } 1095 1096 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, 1097 int comp_args_on_stack, 1098 const GrowableArray<SigEntry>* sig, 1099 const VMRegPair *regs) { 1100 1101 // Note: r13 contains the senderSP on entry. We must preserve it since 1102 // we may do a i2c -> c2i transition if we lose a race where compiled 1103 // code goes non-entrant while we get args ready. 1104 // In addition we use r13 to locate all the interpreter args as 1105 // we must align the stack to 16 bytes on an i2c entry else we 1106 // lose alignment we expect in all compiled code and register 1107 // save code can segv when fxsave instructions find improperly 1108 // aligned stack pointer. 1109 1110 // Adapters can be frameless because they do not require the caller 1111 // to perform additional cleanup work, such as correcting the stack pointer. 1112 // An i2c adapter is frameless because the *caller* frame, which is interpreted, 1113 // routinely repairs its own stack pointer (from interpreter_frame_last_sp), 1114 // even if a callee has modified the stack pointer. 1115 // A c2i adapter is frameless because the *callee* frame, which is interpreted, 1116 // routinely repairs its caller's stack pointer (from sender_sp, which is set 1117 // up via the senderSP register). 1118 // In other words, if *either* the caller or callee is interpreted, we can 1119 // get the stack pointer repaired after a call. 1120 // This is why c2i and i2c adapters cannot be indefinitely composed. 1121 // In particular, if a c2i adapter were to somehow call an i2c adapter, 1122 // both caller and callee would be compiled methods, and neither would 1123 // clean up the stack pointer changes performed by the two adapters. 1124 // If this happens, control eventually transfers back to the compiled 1125 // caller, but with an uncorrected stack, causing delayed havoc. 1126 1127 if (VerifyAdapterCalls && 1128 (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) { 1129 // So, let's test for cascading c2i/i2c adapters right now. 
1130 // assert(Interpreter::contains($return_addr) || 1131 // StubRoutines::contains($return_addr), 1132 // "i2c adapter must return to an interpreter frame"); 1133 __ block_comment("verify_i2c { "); 1134 // Pick up the return address 1135 __ movptr(rax, Address(rsp, 0)); 1136 Label L_ok; 1137 if (Interpreter::code() != nullptr) { 1138 range_check(masm, rax, r11, 1139 Interpreter::code()->code_start(), 1140 Interpreter::code()->code_end(), 1141 L_ok); 1142 } 1143 if (StubRoutines::initial_stubs_code() != nullptr) { 1144 range_check(masm, rax, r11, 1145 StubRoutines::initial_stubs_code()->code_begin(), 1146 StubRoutines::initial_stubs_code()->code_end(), 1147 L_ok); 1148 } 1149 if (StubRoutines::final_stubs_code() != nullptr) { 1150 range_check(masm, rax, r11, 1151 StubRoutines::final_stubs_code()->code_begin(), 1152 StubRoutines::final_stubs_code()->code_end(), 1153 L_ok); 1154 } 1155 const char* msg = "i2c adapter must return to an interpreter frame"; 1156 __ block_comment(msg); 1157 __ stop(msg); 1158 __ bind(L_ok); 1159 __ block_comment("} verify_i2ce "); 1160 } 1161 1162 // Must preserve original SP for loading incoming arguments because 1163 // we need to align the outgoing SP for compiled code. 1164 __ movptr(r11, rsp); 1165 1166 // Pick up the return address 1167 __ pop(rax); 1168 1169 // Convert 4-byte c2 stack slots to words. 1170 int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord; 1171 1172 if (comp_args_on_stack) { 1173 __ subptr(rsp, comp_words_on_stack * wordSize); 1174 } 1175 1176 // Ensure compiled code always sees stack at proper alignment 1177 __ andptr(rsp, -16); 1178 1179 // push the return address and misalign the stack that youngest frame always sees 1180 // as far as the placement of the call instruction 1181 __ push(rax); 1182 1183 // Put saved SP in another register 1184 const Register saved_sp = rax; 1185 __ movptr(saved_sp, r11); 1186 1187 // Will jump to the compiled code just as if compiled code was doing it. 1188 // Pre-load the register-jump target early, to schedule it better. 1189 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset()))); 1190 1191 #if INCLUDE_JVMCI 1192 if (EnableJVMCI) { 1193 // check if this call should be routed towards a specific entry point 1194 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 1195 Label no_alternative_target; 1196 __ jcc(Assembler::equal, no_alternative_target); 1197 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset()))); 1198 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0); 1199 __ bind(no_alternative_target); 1200 } 1201 #endif // INCLUDE_JVMCI 1202 1203 int total_args_passed = sig->length(); 1204 1205 // Now generate the shuffle code. Pick up all register args and move the 1206 // rest through the floating point stack top. 1207 for (int i = 0; i < total_args_passed; i++) { 1208 BasicType bt = sig->at(i)._bt; 1209 if (bt == T_VOID) { 1210 // Longs and doubles are passed in native word order, but misaligned 1211 // in the 32-bit build. 1212 BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL; 1213 assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half"); 1214 continue; 1215 } 1216 1217 // Pick up 0, 1 or 2 words from SP+offset. 
1218 1219 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), 1220 "scrambled load targets?"); 1221 // Load in argument order going down. 1222 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize; 1223 // Point to interpreter value (vs. tag) 1224 int next_off = ld_off - Interpreter::stackElementSize; 1225 // 1226 // 1227 // 1228 VMReg r_1 = regs[i].first(); 1229 VMReg r_2 = regs[i].second(); 1230 if (!r_1->is_valid()) { 1231 assert(!r_2->is_valid(), ""); 1232 continue; 1233 } 1234 if (r_1->is_stack()) { 1235 // Convert stack slot to an SP offset (+ wordSize to account for return address ) 1236 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize; 1237 1238 // We can use r13 as a temp here because compiled code doesn't need r13 as an input 1239 // and if we end up going thru a c2i because of a miss a reasonable value of r13 1240 // will be generated. 1241 if (!r_2->is_valid()) { 1242 // sign extend??? 1243 __ movl(r13, Address(saved_sp, ld_off)); 1244 __ movptr(Address(rsp, st_off), r13); 1245 } else { 1246 // 1247 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1248 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1249 // So we must adjust where to pick up the data to match the interpreter. 1250 // 1251 // Interpreter local[n] == MSW, local[n+1] == LSW however locals 1252 // are accessed as negative so LSW is at LOW address 1253 1254 // ld_off is MSW so get LSW 1255 const int offset = (bt==T_LONG||bt==T_DOUBLE)? 1256 next_off : ld_off; 1257 __ movq(r13, Address(saved_sp, offset)); 1258 // st_off is LSW (i.e. reg.first()) 1259 __ movq(Address(rsp, st_off), r13); 1260 } 1261 } else if (r_1->is_Register()) { // Register argument 1262 Register r = r_1->as_Register(); 1263 assert(r != rax, "must be different"); 1264 if (r_2->is_valid()) { 1265 // 1266 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE 1267 // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case 1268 // So we must adjust where to pick up the data to match the interpreter. 1269 1270 const int offset = (bt==T_LONG||bt==T_DOUBLE)? 1271 next_off : ld_off; 1272 1273 // this can be a misaligned move 1274 __ movq(r, Address(saved_sp, offset)); 1275 } else { 1276 // sign extend and use a full word? 1277 __ movl(r, Address(saved_sp, ld_off)); 1278 } 1279 } else { 1280 if (!r_2->is_valid()) { 1281 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off)); 1282 } else { 1283 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off)); 1284 } 1285 } 1286 } 1287 1288 __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about 1289 1290 // 6243940 We might end up in handle_wrong_method if 1291 // the callee is deoptimized as we race thru here. If that 1292 // happens we don't want to take a safepoint because the 1293 // caller frame will look interpreted and arguments are now 1294 // "compiled" so it is much better to make this transition 1295 // invisible to the stack walking code. Unfortunately if 1296 // we try and find the callee by normal means a safepoint 1297 // is possible. So we stash the desired callee in the thread 1298 // and the vm will find there should this case occur. 
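
  // The stash is the callee_target field of the JavaThread; the runtime's
  // handle_wrong_method path reads it back so the call can be re-resolved
  // without walking this half-converted frame.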

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // Put the Method* where a c2i adapter would expect it, should we end up there.
  // This is only needed because C2 resolve stubs return the Method* as a result
  // in rax.
  __ mov(rax, rbx);
  __ jmp(r11);
}

static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
  Register data = rax;
  __ ic_check(1 /* end_alignment */);
  __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));

  // The method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get
  // the call site corrected.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, skip_fixup);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
                                                            int comp_args_on_stack,
                                                            const GrowableArray<SigEntry>* sig,
                                                            const VMRegPair* regs,
                                                            const GrowableArray<SigEntry>* sig_cc,
                                                            const VMRegPair* regs_cc,
                                                            const GrowableArray<SigEntry>* sig_cc_ro,
                                                            const VMRegPair* regs_cc_ro,
                                                            AdapterFingerPrint* fingerprint,
                                                            AdapterBlob*& new_adapter,
                                                            bool allocate_code_blob) {
  address i2c_entry = __ pc();
  gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
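
  // The code emitted below publishes several entry points into this one blob:
  // a c2i_unverified_entry that performs the inline-cache check first, the
  // scalarized c2i_entry (guarded by a class-init barrier), an optional
  // receiver-unscalarized "ro" variant when regs_cc != regs_cc_ro, and a
  // separate non-scalarized variant when the compiled and interpreter-facing
  // signatures differ (regs != regs_cc).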
1345 1346 address c2i_unverified_entry = __ pc(); 1347 address c2i_unverified_inline_entry = __ pc(); 1348 Label skip_fixup; 1349 1350 gen_inline_cache_check(masm, skip_fixup); 1351 1352 OopMapSet* oop_maps = new OopMapSet(); 1353 int frame_complete = CodeOffsets::frame_never_safe; 1354 int frame_size_in_words = 0; 1355 1356 // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver) 1357 address c2i_no_clinit_check_entry = nullptr; 1358 address c2i_inline_ro_entry = __ pc(); 1359 if (regs_cc != regs_cc_ro) { 1360 // No class init barrier needed because method is guaranteed to be non-static 1361 gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry, 1362 skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false); 1363 skip_fixup.reset(); 1364 } 1365 1366 // Scalarized c2i adapter 1367 address c2i_entry = __ pc(); 1368 address c2i_inline_entry = __ pc(); 1369 gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry, 1370 skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true); 1371 1372 // Non-scalarized c2i adapter 1373 if (regs != regs_cc) { 1374 c2i_unverified_inline_entry = __ pc(); 1375 Label inline_entry_skip_fixup; 1376 gen_inline_cache_check(masm, inline_entry_skip_fixup); 1377 1378 c2i_inline_entry = __ pc(); 1379 gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry, 1380 inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false); 1381 } 1382 1383 // The c2i adapters might safepoint and trigger a GC. The caller must make sure that 1384 // the GC knows about the location of oop argument locations passed to the c2i adapter. 1385 if (allocate_code_blob) { 1386 bool caller_must_gc_arguments = (regs != regs_cc); 1387 new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments); 1388 } 1389 1390 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry); 1391 } 1392 1393 int SharedRuntime::c_calling_convention(const BasicType *sig_bt, 1394 VMRegPair *regs, 1395 int total_args_passed) { 1396 1397 // We return the amount of VMRegImpl stack slots we need to reserve for all 1398 // the arguments NOT counting out_preserve_stack_slots. 
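
  // Unlike the Java convention above, the C convention differs per OS: Win64
  // provides only four integer and four float argument registers and they
  // share positions (an integer argument in slot N also consumes float slot N,
  // and vice versa), while the SysV ABI used elsewhere provides six integer
  // and eight float registers that are counted independently. The tables and
  // the #ifdef _WIN64 bookkeeping below encode exactly that.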
1399 1400 // NOTE: These arrays will have to change when c1 is ported 1401 #ifdef _WIN64 1402 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1403 c_rarg0, c_rarg1, c_rarg2, c_rarg3 1404 }; 1405 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1406 c_farg0, c_farg1, c_farg2, c_farg3 1407 }; 1408 #else 1409 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { 1410 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5 1411 }; 1412 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = { 1413 c_farg0, c_farg1, c_farg2, c_farg3, 1414 c_farg4, c_farg5, c_farg6, c_farg7 1415 }; 1416 #endif // _WIN64 1417 1418 1419 uint int_args = 0; 1420 uint fp_args = 0; 1421 uint stk_args = 0; // inc by 2 each time 1422 1423 for (int i = 0; i < total_args_passed; i++) { 1424 switch (sig_bt[i]) { 1425 case T_BOOLEAN: 1426 case T_CHAR: 1427 case T_BYTE: 1428 case T_SHORT: 1429 case T_INT: 1430 if (int_args < Argument::n_int_register_parameters_c) { 1431 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); 1432 #ifdef _WIN64 1433 fp_args++; 1434 // Allocate slots for callee to stuff register args the stack. 1435 stk_args += 2; 1436 #endif 1437 } else { 1438 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1439 stk_args += 2; 1440 } 1441 break; 1442 case T_LONG: 1443 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1444 // fall through 1445 case T_OBJECT: 1446 case T_ARRAY: 1447 case T_ADDRESS: 1448 case T_METADATA: 1449 if (int_args < Argument::n_int_register_parameters_c) { 1450 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg()); 1451 #ifdef _WIN64 1452 fp_args++; 1453 stk_args += 2; 1454 #endif 1455 } else { 1456 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1457 stk_args += 2; 1458 } 1459 break; 1460 case T_FLOAT: 1461 if (fp_args < Argument::n_float_register_parameters_c) { 1462 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); 1463 #ifdef _WIN64 1464 int_args++; 1465 // Allocate slots for callee to stuff register args the stack. 1466 stk_args += 2; 1467 #endif 1468 } else { 1469 regs[i].set1(VMRegImpl::stack2reg(stk_args)); 1470 stk_args += 2; 1471 } 1472 break; 1473 case T_DOUBLE: 1474 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); 1475 if (fp_args < Argument::n_float_register_parameters_c) { 1476 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg()); 1477 #ifdef _WIN64 1478 int_args++; 1479 // Allocate slots for callee to stuff register args the stack. 1480 stk_args += 2; 1481 #endif 1482 } else { 1483 regs[i].set2(VMRegImpl::stack2reg(stk_args)); 1484 stk_args += 2; 1485 } 1486 break; 1487 case T_VOID: // Halves of longs and doubles 1488 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); 1489 regs[i].set_bad(); 1490 break; 1491 default: 1492 ShouldNotReachHere(); 1493 break; 1494 } 1495 } 1496 #ifdef _WIN64 1497 // windows abi requires that we always allocate enough stack space 1498 // for 4 64bit registers to be stored down. 
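  // 4 registers * 8 bytes = 32 bytes of "shadow space", i.e. 8 of the 4-byte
  // VMRegImpl slots counted here, hence the minimum of 8 enforced below.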
1499 if (stk_args < 8) { 1500 stk_args = 8; 1501 } 1502 #endif // _WIN64 1503 1504 return stk_args; 1505 } 1506 1507 int SharedRuntime::vector_calling_convention(VMRegPair *regs, 1508 uint num_bits, 1509 uint total_args_passed) { 1510 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512, 1511 "only certain vector sizes are supported for now"); 1512 1513 static const XMMRegister VEC_ArgReg[32] = { 1514 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 1515 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1516 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, 1517 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31 1518 }; 1519 1520 uint stk_args = 0; 1521 uint fp_args = 0; 1522 1523 for (uint i = 0; i < total_args_passed; i++) { 1524 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg(); 1525 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15)); 1526 regs[i].set_pair(vmreg->next(next_val), vmreg); 1527 } 1528 1529 return stk_args; 1530 } 1531 1532 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1533 // We always ignore the frame_slots arg and just use the space just below frame pointer 1534 // which by this time is free to use 1535 switch (ret_type) { 1536 case T_FLOAT: 1537 __ movflt(Address(rbp, -wordSize), xmm0); 1538 break; 1539 case T_DOUBLE: 1540 __ movdbl(Address(rbp, -wordSize), xmm0); 1541 break; 1542 case T_VOID: break; 1543 default: { 1544 __ movptr(Address(rbp, -wordSize), rax); 1545 } 1546 } 1547 } 1548 1549 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { 1550 // We always ignore the frame_slots arg and just use the space just below frame pointer 1551 // which by this time is free to use 1552 switch (ret_type) { 1553 case T_FLOAT: 1554 __ movflt(xmm0, Address(rbp, -wordSize)); 1555 break; 1556 case T_DOUBLE: 1557 __ movdbl(xmm0, Address(rbp, -wordSize)); 1558 break; 1559 case T_VOID: break; 1560 default: { 1561 __ movptr(rax, Address(rbp, -wordSize)); 1562 } 1563 } 1564 } 1565 1566 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1567 for ( int i = first_arg ; i < arg_count ; i++ ) { 1568 if (args[i].first()->is_Register()) { 1569 __ push(args[i].first()->as_Register()); 1570 } else if (args[i].first()->is_XMMRegister()) { 1571 __ subptr(rsp, 2*wordSize); 1572 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister()); 1573 } 1574 } 1575 } 1576 1577 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { 1578 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) { 1579 if (args[i].first()->is_Register()) { 1580 __ pop(args[i].first()->as_Register()); 1581 } else if (args[i].first()->is_XMMRegister()) { 1582 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0)); 1583 __ addptr(rsp, 2*wordSize); 1584 } 1585 } 1586 } 1587 1588 static void verify_oop_args(MacroAssembler* masm, 1589 const methodHandle& method, 1590 const BasicType* sig_bt, 1591 const VMRegPair* regs) { 1592 Register temp_reg = rbx; // not part of any compiled calling seq 1593 if (VerifyOops) { 1594 for (int i = 0; i < method->size_of_parameters(); i++) { 1595 if (is_reference_type(sig_bt[i])) { 1596 VMReg r = regs[i].first(); 1597 assert(r->is_valid(), "bad oop arg"); 1598 if (r->is_stack()) { 1599 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 1600 __ verify_oop(temp_reg); 1601 } else { 1602 __ 
verify_oop(r->as_Register()); 1603 } 1604 } 1605 } 1606 } 1607 } 1608 1609 static void check_continuation_enter_argument(VMReg actual_vmreg, 1610 Register expected_reg, 1611 const char* name) { 1612 assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name); 1613 assert(actual_vmreg->as_Register() == expected_reg, 1614 "%s is in unexpected register: %s instead of %s", 1615 name, actual_vmreg->as_Register()->name(), expected_reg->name()); 1616 } 1617 1618 1619 //---------------------------- continuation_enter_setup --------------------------- 1620 // 1621 // Arguments: 1622 // None. 1623 // 1624 // Results: 1625 // rsp: pointer to blank ContinuationEntry 1626 // 1627 // Kills: 1628 // rax 1629 // 1630 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) { 1631 assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, ""); 1632 assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1633 assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, ""); 1634 1635 stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize; 1636 __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1637 1638 int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size; 1639 OopMap* map = new OopMap(frame_size, 0); 1640 1641 __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset())); 1642 __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax); 1643 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp); 1644 1645 return map; 1646 } 1647 1648 //---------------------------- fill_continuation_entry --------------------------- 1649 // 1650 // Arguments: 1651 // rsp: pointer to blank Continuation entry 1652 // reg_cont_obj: pointer to the continuation 1653 // reg_flags: flags 1654 // 1655 // Results: 1656 // rsp: pointer to filled out ContinuationEntry 1657 // 1658 // Kills: 1659 // rax 1660 // 1661 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) { 1662 assert_different_registers(rax, reg_cont_obj, reg_flags); 1663 #ifdef ASSERT 1664 __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value()); 1665 #endif 1666 __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj); 1667 __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags); 1668 __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0); 1669 __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0); 1670 __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0); 1671 1672 __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset())); 1673 __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax); 1674 __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset())); 1675 __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax); 1676 1677 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0); 1678 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0); 1679 } 1680 1681 //---------------------------- continuation_enter_cleanup --------------------------- 1682 // 1683 // Arguments: 1684 // rsp: pointer to the ContinuationEntry 1685 // 1686 // Results: 1687 // rsp: pointer to the spilled rbp in the entry frame 1688 // 1689 // Kills: 1690 // rbx 1691 // 1692 static void continuation_enter_cleanup(MacroAssembler* masm) { 1693 #ifdef ASSERT 1694 
Label L_good_sp; 1695 __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1696 __ jcc(Assembler::equal, L_good_sp); 1697 __ stop("Incorrect rsp at continuation_enter_cleanup"); 1698 __ bind(L_good_sp); 1699 #endif 1700 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset())); 1701 __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx); 1702 1703 if (CheckJNICalls) { 1704 // Check if this is a virtual thread continuation 1705 Label L_skip_vthread_code; 1706 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1707 __ jcc(Assembler::equal, L_skip_vthread_code); 1708 1709 // If the held monitor count is > 0 and this vthread is terminating then 1710 // it failed to release a JNI monitor. So we issue the same log message 1711 // that JavaThread::exit does. 1712 __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1713 __ jcc(Assembler::equal, L_skip_vthread_code); 1714 1715 // rax may hold an exception oop, save it before the call 1716 __ push(rax); 1717 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held)); 1718 __ pop(rax); 1719 1720 // For vthreads we have to explicitly zero the JNI monitor count of the carrier 1721 // on termination. The held count is implicitly zeroed below when we restore from 1722 // the parent held count (which has to be zero). 1723 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1724 1725 __ bind(L_skip_vthread_code); 1726 } 1727 #ifdef ASSERT 1728 else { 1729 // Check if this is a virtual thread continuation 1730 Label L_skip_vthread_code; 1731 __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0); 1732 __ jcc(Assembler::equal, L_skip_vthread_code); 1733 1734 // See comment just above. If not checking JNI calls the JNI count is only 1735 // needed for assertion checking. 1736 __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0); 1737 1738 __ bind(L_skip_vthread_code); 1739 } 1740 #endif 1741 1742 __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset())); 1743 __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx); 1744 1745 __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset())); 1746 __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx); 1747 __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size())); 1748 } 1749 1750 static void gen_continuation_enter(MacroAssembler* masm, 1751 const VMRegPair* regs, 1752 int& exception_offset, 1753 OopMapSet* oop_maps, 1754 int& frame_complete, 1755 int& stack_slots, 1756 int& interpreted_entry_offset, 1757 int& compiled_entry_offset) { 1758 1759 // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread) 1760 int pos_cont_obj = 0; 1761 int pos_is_cont = 1; 1762 int pos_is_virtual = 2; 1763 1764 // The platform-specific calling convention may present the arguments in various registers. 1765 // To simplify the rest of the code, we expect the arguments to reside at these known 1766 // registers, and we additionally check the placement here in case calling convention ever 1767 // changes. 
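  // (Under the SysV AMD64 ABI c_rarg1/c_rarg2/c_rarg3 are rsi/rdx/rcx; on
  // Windows they are rdx/r8/r9. The checks below fail fast if the upcall
  // convention ever stops matching these expectations.)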
1768 Register reg_cont_obj = c_rarg1; 1769 Register reg_is_cont = c_rarg2; 1770 Register reg_is_virtual = c_rarg3; 1771 1772 check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object"); 1773 check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue"); 1774 check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread"); 1775 1776 // Utility methods kill rax, make sure there are no collisions 1777 assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual); 1778 1779 AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(), 1780 relocInfo::static_call_type); 1781 1782 address start = __ pc(); 1783 1784 Label L_thaw, L_exit; 1785 1786 // i2i entry used at interp_only_mode only 1787 interpreted_entry_offset = __ pc() - start; 1788 { 1789 #ifdef ASSERT 1790 Label is_interp_only; 1791 __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0); 1792 __ jcc(Assembler::notEqual, is_interp_only); 1793 __ stop("enterSpecial interpreter entry called when not in interp_only_mode"); 1794 __ bind(is_interp_only); 1795 #endif 1796 1797 __ pop(rax); // return address 1798 // Read interpreter arguments into registers (this is an ad-hoc i2c adapter) 1799 __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2)); 1800 __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1)); 1801 __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0)); 1802 __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment 1803 __ push(rax); // return address 1804 __ push_cont_fastpath(); 1805 1806 __ enter(); 1807 1808 stack_slots = 2; // will be adjusted in setup 1809 OopMap* map = continuation_enter_setup(masm, stack_slots); 1810 // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe, 1811 // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway. 1812 1813 __ verify_oop(reg_cont_obj); 1814 1815 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1816 1817 // If continuation, call to thaw. Otherwise, resolve the call and exit. 1818 __ testptr(reg_is_cont, reg_is_cont); 1819 __ jcc(Assembler::notZero, L_thaw); 1820 1821 // --- Resolve path 1822 1823 // Make sure the call is patchable 1824 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1825 // Emit stub for static call 1826 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1827 if (stub == nullptr) { 1828 fatal("CodeCache is full at gen_continuation_enter"); 1829 } 1830 __ call(resolve); 1831 oop_maps->add_gc_map(__ pc() - start, map); 1832 __ post_call_nop(); 1833 1834 __ jmp(L_exit); 1835 } 1836 1837 // compiled entry 1838 __ align(CodeEntryAlignment); 1839 compiled_entry_offset = __ pc() - start; 1840 __ enter(); 1841 1842 stack_slots = 2; // will be adjusted in setup 1843 OopMap* map = continuation_enter_setup(masm, stack_slots); 1844 1845 // Frame is now completed as far as size and linkage. 1846 frame_complete = __ pc() - start; 1847 1848 __ verify_oop(reg_cont_obj); 1849 1850 fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual); 1851 1852 // If isContinue, call to thaw. 
Otherwise, call Continuation.enter(Continuation c, boolean isContinue) 1853 __ testptr(reg_is_cont, reg_is_cont); 1854 __ jccb(Assembler::notZero, L_thaw); 1855 1856 // --- call Continuation.enter(Continuation c, boolean isContinue) 1857 1858 // Make sure the call is patchable 1859 __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset); 1860 1861 // Emit stub for static call 1862 address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc()); 1863 if (stub == nullptr) { 1864 fatal("CodeCache is full at gen_continuation_enter"); 1865 } 1866 1867 // The call needs to be resolved. There's a special case for this in 1868 // SharedRuntime::find_callee_info_helper() which calls 1869 // LinkResolver::resolve_continuation_enter() which resolves the call to 1870 // Continuation.enter(Continuation c, boolean isContinue). 1871 __ call(resolve); 1872 1873 oop_maps->add_gc_map(__ pc() - start, map); 1874 __ post_call_nop(); 1875 1876 __ jmpb(L_exit); 1877 1878 // --- Thawing path 1879 1880 __ bind(L_thaw); 1881 1882 ContinuationEntry::_thaw_call_pc_offset = __ pc() - start; 1883 __ call(RuntimeAddress(StubRoutines::cont_thaw())); 1884 1885 ContinuationEntry::_return_pc_offset = __ pc() - start; 1886 oop_maps->add_gc_map(__ pc() - start, map->deep_copy()); 1887 __ post_call_nop(); 1888 1889 // --- Normal exit (resolve/thawing) 1890 1891 __ bind(L_exit); 1892 ContinuationEntry::_cleanup_offset = __ pc() - start; 1893 continuation_enter_cleanup(masm); 1894 __ pop(rbp); 1895 __ ret(0); 1896 1897 // --- Exception handling path 1898 1899 exception_offset = __ pc() - start; 1900 1901 continuation_enter_cleanup(masm); 1902 __ pop(rbp); 1903 1904 __ movptr(c_rarg0, r15_thread); 1905 __ movptr(c_rarg1, Address(rsp, 0)); // return address 1906 1907 // rax still holds the original exception oop, save it before the call 1908 __ push(rax); 1909 1910 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2); 1911 __ movptr(rbx, rax); 1912 1913 // Continue at exception handler: 1914 // rax: exception oop 1915 // rbx: exception handler 1916 // rdx: exception pc 1917 __ pop(rax); 1918 __ verify_oop(rax); 1919 __ pop(rdx); 1920 __ jmp(rbx); 1921 } 1922 1923 static void gen_continuation_yield(MacroAssembler* masm, 1924 const VMRegPair* regs, 1925 OopMapSet* oop_maps, 1926 int& frame_complete, 1927 int& stack_slots, 1928 int& compiled_entry_offset) { 1929 enum layout { 1930 rbp_off, 1931 rbpH_off, 1932 return_off, 1933 return_off2, 1934 framesize // inclusive of return address 1935 }; 1936 stack_slots = framesize / VMRegImpl::slots_per_word; 1937 assert(stack_slots == 2, "recheck layout"); 1938 1939 address start = __ pc(); 1940 compiled_entry_offset = __ pc() - start; 1941 __ enter(); 1942 address the_pc = __ pc(); 1943 1944 frame_complete = the_pc - start; 1945 1946 // This nop must be exactly at the PC we push into the frame info. 1947 // We use this nop for fast CodeBlob lookup, associate the OopMap 1948 // with it right away. 
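  // (frame_complete recorded above equals the_pc, so the GC map registered just
  // below is keyed to the address of this nop.)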
1949 __ post_call_nop(); 1950 OopMap* map = new OopMap(framesize, 1); 1951 oop_maps->add_gc_map(frame_complete, map); 1952 1953 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 1954 __ movptr(c_rarg0, r15_thread); 1955 __ movptr(c_rarg1, rsp); 1956 __ call_VM_leaf(Continuation::freeze_entry(), 2); 1957 __ reset_last_Java_frame(true); 1958 1959 Label L_pinned; 1960 1961 __ testptr(rax, rax); 1962 __ jcc(Assembler::notZero, L_pinned); 1963 1964 __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())); 1965 continuation_enter_cleanup(masm); 1966 __ pop(rbp); 1967 __ ret(0); 1968 1969 __ bind(L_pinned); 1970 1971 // Pinned, return to caller 1972 1973 // handle pending exception thrown by freeze 1974 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 1975 Label ok; 1976 __ jcc(Assembler::equal, ok); 1977 __ leave(); 1978 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 1979 __ bind(ok); 1980 1981 __ leave(); 1982 __ ret(0); 1983 } 1984 1985 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) { 1986 ::continuation_enter_cleanup(masm); 1987 } 1988 1989 static void gen_special_dispatch(MacroAssembler* masm, 1990 const methodHandle& method, 1991 const BasicType* sig_bt, 1992 const VMRegPair* regs) { 1993 verify_oop_args(masm, method, sig_bt, regs); 1994 vmIntrinsics::ID iid = method->intrinsic_id(); 1995 1996 // Now write the args into the outgoing interpreter space 1997 bool has_receiver = false; 1998 Register receiver_reg = noreg; 1999 int member_arg_pos = -1; 2000 Register member_reg = noreg; 2001 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); 2002 if (ref_kind != 0) { 2003 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument 2004 member_reg = rbx; // known to be free at this point 2005 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); 2006 } else if (iid == vmIntrinsics::_invokeBasic) { 2007 has_receiver = true; 2008 } else if (iid == vmIntrinsics::_linkToNative) { 2009 member_arg_pos = method->size_of_parameters() - 1; // trailing NativeEntryPoint argument 2010 member_reg = rbx; // known to be free at this point 2011 } else { 2012 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid)); 2013 } 2014 2015 if (member_reg != noreg) { 2016 // Load the member_arg into register, if necessary. 2017 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); 2018 VMReg r = regs[member_arg_pos].first(); 2019 if (r->is_stack()) { 2020 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 2021 } else { 2022 // no data motion is needed 2023 member_reg = r->as_Register(); 2024 } 2025 } 2026 2027 if (has_receiver) { 2028 // Make sure the receiver is loaded into a register. 2029 assert(method->size_of_parameters() > 0, "oob"); 2030 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); 2031 VMReg r = regs[0].first(); 2032 assert(r->is_valid(), "bad receiver arg"); 2033 if (r->is_stack()) { 2034 // Porting note: This assumes that compiled calling conventions always 2035 // pass the receiver oop in a register. If this is not true on some 2036 // platform, pick a temp and load the receiver from stack. 
2037 fatal("receiver always in a register"); 2038 receiver_reg = j_rarg0; // known to be free at this point 2039 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize)); 2040 } else { 2041 // no data motion is needed 2042 receiver_reg = r->as_Register(); 2043 } 2044 } 2045 2046 // Figure out which address we are really jumping to: 2047 MethodHandles::generate_method_handle_dispatch(masm, iid, 2048 receiver_reg, member_reg, /*for_compiler_entry:*/ true); 2049 } 2050 2051 // --------------------------------------------------------------------------- 2052 // Generate a native wrapper for a given method. The method takes arguments 2053 // in the Java compiled code convention, marshals them to the native 2054 // convention (handlizes oops, etc), transitions to native, makes the call, 2055 // returns to java state (possibly blocking), unhandlizes any result and 2056 // returns. 2057 // 2058 // Critical native functions are a shorthand for the use of 2059 // GetPrimtiveArrayCritical and disallow the use of any other JNI 2060 // functions. The wrapper is expected to unpack the arguments before 2061 // passing them to the callee. Critical native functions leave the state _in_Java, 2062 // since they cannot stop for GC. 2063 // Some other parts of JNI setup are skipped like the tear down of the JNI handle 2064 // block and the check for pending exceptions it's impossible for them 2065 // to be thrown. 2066 // 2067 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, 2068 const methodHandle& method, 2069 int compile_id, 2070 BasicType* in_sig_bt, 2071 VMRegPair* in_regs, 2072 BasicType ret_type) { 2073 if (method->is_continuation_native_intrinsic()) { 2074 int exception_offset = -1; 2075 OopMapSet* oop_maps = new OopMapSet(); 2076 int frame_complete = -1; 2077 int stack_slots = -1; 2078 int interpreted_entry_offset = -1; 2079 int vep_offset = -1; 2080 if (method->is_continuation_enter_intrinsic()) { 2081 gen_continuation_enter(masm, 2082 in_regs, 2083 exception_offset, 2084 oop_maps, 2085 frame_complete, 2086 stack_slots, 2087 interpreted_entry_offset, 2088 vep_offset); 2089 } else if (method->is_continuation_yield_intrinsic()) { 2090 gen_continuation_yield(masm, 2091 in_regs, 2092 oop_maps, 2093 frame_complete, 2094 stack_slots, 2095 vep_offset); 2096 } else { 2097 guarantee(false, "Unknown Continuation native intrinsic"); 2098 } 2099 2100 #ifdef ASSERT 2101 if (method->is_continuation_enter_intrinsic()) { 2102 assert(interpreted_entry_offset != -1, "Must be set"); 2103 assert(exception_offset != -1, "Must be set"); 2104 } else { 2105 assert(interpreted_entry_offset == -1, "Must be unset"); 2106 assert(exception_offset == -1, "Must be unset"); 2107 } 2108 assert(frame_complete != -1, "Must be set"); 2109 assert(stack_slots != -1, "Must be set"); 2110 assert(vep_offset != -1, "Must be set"); 2111 #endif 2112 2113 __ flush(); 2114 nmethod* nm = nmethod::new_native_nmethod(method, 2115 compile_id, 2116 masm->code(), 2117 vep_offset, 2118 frame_complete, 2119 stack_slots, 2120 in_ByteSize(-1), 2121 in_ByteSize(-1), 2122 oop_maps, 2123 exception_offset); 2124 if (nm == nullptr) return nm; 2125 if (method->is_continuation_enter_intrinsic()) { 2126 ContinuationEntry::set_enter_code(nm, interpreted_entry_offset); 2127 } else if (method->is_continuation_yield_intrinsic()) { 2128 _cont_doYield_stub = nm; 2129 } 2130 return nm; 2131 } 2132 2133 if (method->is_method_handle_intrinsic()) { 2134 vmIntrinsics::ID iid = method->intrinsic_id(); 2135 intptr_t 
start = (intptr_t)__ pc(); 2136 int vep_offset = ((intptr_t)__ pc()) - start; 2137 gen_special_dispatch(masm, 2138 method, 2139 in_sig_bt, 2140 in_regs); 2141 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period 2142 __ flush(); 2143 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually 2144 return nmethod::new_native_nmethod(method, 2145 compile_id, 2146 masm->code(), 2147 vep_offset, 2148 frame_complete, 2149 stack_slots / VMRegImpl::slots_per_word, 2150 in_ByteSize(-1), 2151 in_ByteSize(-1), 2152 nullptr); 2153 } 2154 address native_func = method->native_function(); 2155 assert(native_func != nullptr, "must have function"); 2156 2157 // An OopMap for lock (and class if static) 2158 OopMapSet *oop_maps = new OopMapSet(); 2159 intptr_t start = (intptr_t)__ pc(); 2160 2161 // We have received a description of where all the java arg are located 2162 // on entry to the wrapper. We need to convert these args to where 2163 // the jni function will expect them. To figure out where they go 2164 // we convert the java signature to a C signature by inserting 2165 // the hidden arguments as arg[0] and possibly arg[1] (static method) 2166 2167 const int total_in_args = method->size_of_parameters(); 2168 int total_c_args = total_in_args + (method->is_static() ? 2 : 1); 2169 2170 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); 2171 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); 2172 2173 int argc = 0; 2174 out_sig_bt[argc++] = T_ADDRESS; 2175 if (method->is_static()) { 2176 out_sig_bt[argc++] = T_OBJECT; 2177 } 2178 2179 for (int i = 0; i < total_in_args ; i++ ) { 2180 out_sig_bt[argc++] = in_sig_bt[i]; 2181 } 2182 2183 // Now figure out where the args must be stored and how much stack space 2184 // they require. 2185 int out_arg_slots; 2186 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args); 2187 2188 // Compute framesize for the wrapper. We need to handlize all oops in 2189 // incoming registers 2190 2191 // Calculate the total number of stack slots we will need. 
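  // Worked example of the accounting below, assuming 16-byte stack alignment:
  // for a static synchronized native taking two object arguments on Linux-x64
  // this is 0 preserve + 0 outgoing arg slots + 12 oop-handle slots + 2 klass
  // + 2 lock + 6 return/temp slots = 22, aligned up to 24 slots (stack_size of
  // 96 bytes).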
2192 2193 // First count the abi requirement plus all of the outgoing args 2194 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; 2195 2196 // Now the space for the inbound oop handle area 2197 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers 2198 2199 int oop_handle_offset = stack_slots; 2200 stack_slots += total_save_slots; 2201 2202 // Now any space we need for handlizing a klass if static method 2203 2204 int klass_slot_offset = 0; 2205 int klass_offset = -1; 2206 int lock_slot_offset = 0; 2207 bool is_static = false; 2208 2209 if (method->is_static()) { 2210 klass_slot_offset = stack_slots; 2211 stack_slots += VMRegImpl::slots_per_word; 2212 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; 2213 is_static = true; 2214 } 2215 2216 // Plus a lock if needed 2217 2218 if (method->is_synchronized()) { 2219 lock_slot_offset = stack_slots; 2220 stack_slots += VMRegImpl::slots_per_word; 2221 } 2222 2223 // Now a place (+2) to save return values or temp during shuffling 2224 // + 4 for return address (which we own) and saved rbp 2225 stack_slots += 6; 2226 2227 // Ok The space we have allocated will look like: 2228 // 2229 // 2230 // FP-> | | 2231 // |---------------------| 2232 // | 2 slots for moves | 2233 // |---------------------| 2234 // | lock box (if sync) | 2235 // |---------------------| <- lock_slot_offset 2236 // | klass (if static) | 2237 // |---------------------| <- klass_slot_offset 2238 // | oopHandle area | 2239 // |---------------------| <- oop_handle_offset (6 java arg registers) 2240 // | outbound memory | 2241 // | based arguments | 2242 // | | 2243 // |---------------------| 2244 // | | 2245 // SP-> | out_preserved_slots | 2246 // 2247 // 2248 2249 2250 // Now compute actual number of stack words we need rounding to make 2251 // stack properly aligned. 2252 stack_slots = align_up(stack_slots, StackAlignmentInSlots); 2253 2254 int stack_size = stack_slots * VMRegImpl::stack_slot_size; 2255 2256 // First thing make an ic check to see if we should even be here 2257 2258 // We are free to use all registers as temps without saving them and 2259 // restoring them except rbp. rbp is the only callee save register 2260 // as far as the interpreter and the compiler(s) are concerned. 2261 2262 const Register receiver = j_rarg0; 2263 2264 Label exception_pending; 2265 2266 assert_different_registers(receiver, rscratch1, rscratch2); 2267 __ verify_oop(receiver); 2268 __ ic_check(8 /* end_alignment */); 2269 2270 int vep_offset = ((intptr_t)__ pc()) - start; 2271 2272 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) { 2273 Label L_skip_barrier; 2274 Register klass = r10; 2275 __ mov_metadata(klass, method->method_holder()); // InstanceKlass* 2276 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 2277 2278 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 2279 2280 __ bind(L_skip_barrier); 2281 } 2282 2283 #ifdef COMPILER1 2284 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available. 
2285 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) { 2286 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/); 2287 } 2288 #endif // COMPILER1 2289 2290 // The instruction at the verified entry point must be 5 bytes or longer 2291 // because it can be patched on the fly by make_non_entrant. The stack bang 2292 // instruction fits that requirement. 2293 2294 // Generate stack overflow check 2295 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size()); 2296 2297 // Generate a new frame for the wrapper. 2298 __ enter(); 2299 // -2 because return address is already present and so is saved rbp 2300 __ subptr(rsp, stack_size - 2*wordSize); 2301 2302 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2303 // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub 2304 bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */); 2305 2306 // Frame is now completed as far as size and linkage. 2307 int frame_complete = ((intptr_t)__ pc()) - start; 2308 2309 #ifdef ASSERT 2310 __ check_stack_alignment(rsp, "improperly aligned stack"); 2311 #endif /* ASSERT */ 2312 2313 2314 // We use r14 as the oop handle for the receiver/klass 2315 // It is callee save so it survives the call to native 2316 2317 const Register oop_handle_reg = r14; 2318 2319 // 2320 // We immediately shuffle the arguments so that for any vm call we have to 2321 // make from here on out (sync slow path, jvmti, etc.) we will have 2322 // captured the oops from our caller and have a valid oopMap for 2323 // them. 2324 2325 // ----------------- 2326 // The Grand Shuffle 2327 2328 // The Java calling convention is either equal (linux) or denser (win64) than the 2329 // C calling convention. However, because of the jni_env argument, the C calling 2330 // convention always has at least one more (and two for static) arguments than Java. 2331 // Therefore if we move the args from java -> c backwards then we will never have 2332 // a register->register conflict and we don't have to build a dependency graph 2333 // and figure out how to break any cycles. 2334 // 2335 2336 // Record esp-based slot for receiver on stack for non-static methods 2337 int receiver_offset = -1; 2338 2339 // This is a trick. We double the stack slots so we can claim 2340 // the oops in the caller's frame. Since we are sure to have 2341 // more args than the caller, doubling is enough to make 2342 // sure we can capture all the incoming oop args from the 2343 // caller. 2344 // 2345 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); 2346 2347 // Mark location of rbp (someday) 2348 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp)); 2349 2350 // Use eax, ebx as temporaries during any memory-memory moves we have to do 2351 // All inbound args are referenced based on rbp and all outbound args via rsp. 2352 2353 2354 #ifdef ASSERT 2355 bool reg_destroyed[Register::number_of_registers]; 2356 bool freg_destroyed[XMMRegister::number_of_registers]; 2357 for ( int r = 0 ; r < Register::number_of_registers ; r++ ) { 2358 reg_destroyed[r] = false; 2359 } 2360 for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) { 2361 freg_destroyed[f] = false; 2362 } 2363 2364 #endif /* ASSERT */ 2365 2366 // For JNI natives the incoming and outgoing registers are offset upwards.
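  // Build (java_index, c_index) pairs starting from the last argument; walking
  // the moves in this order means, per the Grand Shuffle comment above, that no
  // incoming value is overwritten before it has been copied to its C position.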
2367 GrowableArray<int> arg_order(2 * total_in_args); 2368 2369 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { 2370 arg_order.push(i); 2371 arg_order.push(c_arg); 2372 } 2373 2374 for (int ai = 0; ai < arg_order.length(); ai += 2) { 2375 int i = arg_order.at(ai); 2376 int c_arg = arg_order.at(ai + 1); 2377 __ block_comment(err_msg("move %d -> %d", i, c_arg)); 2378 #ifdef ASSERT 2379 if (in_regs[i].first()->is_Register()) { 2380 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); 2381 } else if (in_regs[i].first()->is_XMMRegister()) { 2382 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!"); 2383 } 2384 if (out_regs[c_arg].first()->is_Register()) { 2385 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; 2386 } else if (out_regs[c_arg].first()->is_XMMRegister()) { 2387 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true; 2388 } 2389 #endif /* ASSERT */ 2390 switch (in_sig_bt[i]) { 2391 case T_ARRAY: 2392 case T_OBJECT: 2393 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], 2394 ((i == 0) && (!is_static)), 2395 &receiver_offset); 2396 break; 2397 case T_VOID: 2398 break; 2399 2400 case T_FLOAT: 2401 __ float_move(in_regs[i], out_regs[c_arg]); 2402 break; 2403 2404 case T_DOUBLE: 2405 assert( i + 1 < total_in_args && 2406 in_sig_bt[i + 1] == T_VOID && 2407 out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); 2408 __ double_move(in_regs[i], out_regs[c_arg]); 2409 break; 2410 2411 case T_LONG : 2412 __ long_move(in_regs[i], out_regs[c_arg]); 2413 break; 2414 2415 case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); 2416 2417 default: 2418 __ move32_64(in_regs[i], out_regs[c_arg]); 2419 } 2420 } 2421 2422 int c_arg; 2423 2424 // Pre-load a static method's oop into r14. Used both by locking code and 2425 // the normal JNI call code. 2426 // point c_arg at the first arg that is already loaded in case we 2427 // need to spill before we call out 2428 c_arg = total_c_args - total_in_args; 2429 2430 if (method->is_static()) { 2431 2432 // load oop into a register 2433 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror())); 2434 2435 // Now handlize the static class mirror it's known not-null. 2436 __ movptr(Address(rsp, klass_offset), oop_handle_reg); 2437 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); 2438 2439 // Now get the handle 2440 __ lea(oop_handle_reg, Address(rsp, klass_offset)); 2441 // store the klass handle as second argument 2442 __ movptr(c_rarg1, oop_handle_reg); 2443 // and protect the arg if we must spill 2444 c_arg--; 2445 } 2446 2447 // Change state to native (we save the return address in the thread, since it might not 2448 // be pushed on the stack when we do a stack traversal). It is enough that the pc() 2449 // points into the right code segment. It does not have to be the correct return pc. 2450 // We use the same pc/oopMap repeatedly when we call out 2451 2452 Label native_return; 2453 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2454 // For convenience we use the pc we want to resume to in case of preemption on Object.wait. 2455 __ set_last_Java_frame(rsp, noreg, native_return, rscratch1); 2456 } else { 2457 intptr_t the_pc = (intptr_t) __ pc(); 2458 oop_maps->add_gc_map(the_pc - start, map); 2459 2460 __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1); 2461 } 2462 2463 // We have all of the arguments setup at this point. 
We must not touch any register 2464 // argument registers at this point (what if we save/restore them there are no oop? 2465 2466 if (DTraceMethodProbes) { 2467 // protect the args we've loaded 2468 save_args(masm, total_c_args, c_arg, out_regs); 2469 __ mov_metadata(c_rarg1, method()); 2470 __ call_VM_leaf( 2471 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), 2472 r15_thread, c_rarg1); 2473 restore_args(masm, total_c_args, c_arg, out_regs); 2474 } 2475 2476 // RedefineClasses() tracing support for obsolete method entry 2477 if (log_is_enabled(Trace, redefine, class, obsolete)) { 2478 // protect the args we've loaded 2479 save_args(masm, total_c_args, c_arg, out_regs); 2480 __ mov_metadata(c_rarg1, method()); 2481 __ call_VM_leaf( 2482 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), 2483 r15_thread, c_rarg1); 2484 restore_args(masm, total_c_args, c_arg, out_regs); 2485 } 2486 2487 // Lock a synchronized method 2488 2489 // Register definitions used by locking and unlocking 2490 2491 const Register swap_reg = rax; // Must use rax for cmpxchg instruction 2492 const Register obj_reg = rbx; // Will contain the oop 2493 const Register lock_reg = r13; // Address of compiler lock object (BasicLock) 2494 const Register old_hdr = r13; // value of old header at unlock time 2495 2496 Label slow_path_lock; 2497 Label lock_done; 2498 2499 if (method->is_synchronized()) { 2500 Label count_mon; 2501 2502 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); 2503 2504 // Get the handle (the 2nd argument) 2505 __ mov(oop_handle_reg, c_rarg1); 2506 2507 // Get address of the box 2508 2509 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2510 2511 // Load the oop from the handle 2512 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2513 2514 if (LockingMode == LM_MONITOR) { 2515 __ jmp(slow_path_lock); 2516 } else if (LockingMode == LM_LEGACY) { 2517 // Load immediate 1 into swap_reg %rax 2518 __ movl(swap_reg, 1); 2519 2520 // Load (object->mark() | 1) into swap_reg %rax 2521 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2522 if (EnableValhalla) { 2523 // Mask inline_type bit such that we go to the slow path if object is an inline type 2524 __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place)); 2525 } 2526 2527 // Save (object->mark() | 1) into BasicLock's displaced header 2528 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2529 2530 // src -> dest iff dest == rax else rax <- dest 2531 __ lock(); 2532 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2533 __ jcc(Assembler::equal, count_mon); 2534 2535 // Hmm should this move to the slow path code area??? 2536 2537 // Test if the oopMark is an obvious stack pointer, i.e., 2538 // 1) (mark & 3) == 0, and 2539 // 2) rsp <= mark < mark + os::pagesize() 2540 // These 3 tests can be done by evaluating the following 2541 // expression: ((mark - rsp) & (3 - os::vm_page_size())), 2542 // assuming both stack pointer and pagesize have their 2543 // least significant 2 bits clear. 
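  // For example, with a 4K page the constant 3 - os::vm_page_size() is
  // 0x...fffff003, so the AND below is zero only when the mark's two tag bits
  // are clear and the mark lies within one page above rsp, i.e. it points into
  // our own stack (the recursive-lock case that falls through to count_mon).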
2544 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg 2545 2546 __ subptr(swap_reg, rsp); 2547 __ andptr(swap_reg, 3 - (int)os::vm_page_size()); 2548 2549 // Save the test result, for recursive case, the result is zero 2550 __ movptr(Address(lock_reg, mark_word_offset), swap_reg); 2551 __ jcc(Assembler::notEqual, slow_path_lock); 2552 2553 __ bind(count_mon); 2554 __ inc_held_monitor_count(); 2555 } else { 2556 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2557 __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock); 2558 } 2559 2560 // Slow path will re-enter here 2561 __ bind(lock_done); 2562 } 2563 2564 // Finally just about ready to make the JNI call 2565 2566 // get JNIEnv* which is first argument to native 2567 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset()))); 2568 2569 // Now set thread in native 2570 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native); 2571 2572 __ call(RuntimeAddress(native_func)); 2573 2574 // Verify or restore cpu control state after JNI call 2575 __ restore_cpu_control_state_after_jni(rscratch1); 2576 2577 // Unpack native results. 2578 switch (ret_type) { 2579 case T_BOOLEAN: __ c2bool(rax); break; 2580 case T_CHAR : __ movzwl(rax, rax); break; 2581 case T_BYTE : __ sign_extend_byte (rax); break; 2582 case T_SHORT : __ sign_extend_short(rax); break; 2583 case T_INT : /* nothing to do */ break; 2584 case T_DOUBLE : 2585 case T_FLOAT : 2586 // Result is in xmm0 we'll save as needed 2587 break; 2588 case T_ARRAY: // Really a handle 2589 case T_OBJECT: // Really a handle 2590 break; // can't de-handlize until after safepoint check 2591 case T_VOID: break; 2592 case T_LONG: break; 2593 default : ShouldNotReachHere(); 2594 } 2595 2596 // Switch thread to "native transition" state before reading the synchronization state. 2597 // This additional state is necessary because reading and testing the synchronization 2598 // state is not atomic w.r.t. GC, as this scenario demonstrates: 2599 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. 2600 // VM thread changes sync state to synchronizing and suspends threads for GC. 2601 // Thread A is resumed to finish this native method, but doesn't block here since it 2602 // didn't see any synchronization is progress, and escapes. 2603 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans); 2604 2605 // Force this write out before the read below 2606 if (!UseSystemMemoryBarrier) { 2607 __ membar(Assembler::Membar_mask_bits( 2608 Assembler::LoadLoad | Assembler::LoadStore | 2609 Assembler::StoreLoad | Assembler::StoreStore)); 2610 } 2611 2612 // check for safepoint operation in progress and/or pending suspend requests 2613 { 2614 Label Continue; 2615 Label slow_path; 2616 2617 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */); 2618 2619 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0); 2620 __ jcc(Assembler::equal, Continue); 2621 __ bind(slow_path); 2622 2623 // Don't use call_VM as it will see a possible pending exception and forward it 2624 // and never return here preventing us from clearing _last_native_pc down below. 2625 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are 2626 // preserved and correspond to the bcp/locals pointers. So we do a runtime call 2627 // by hand. 
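  // The hand-rolled sequence below: save any native result, remember rsp in r12,
  // reserve the Windows register-home area, align rsp to 16 bytes, call
  // JavaThread::check_special_condition_for_native_trans(thread), then restore
  // rsp and the saved result before continuing the transition.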
2628 // 2629 __ vzeroupper(); 2630 save_native_result(masm, ret_type, stack_slots); 2631 __ mov(c_rarg0, r15_thread); 2632 __ mov(r12, rsp); // remember sp 2633 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2634 __ andptr(rsp, -16); // align stack as required by ABI 2635 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); 2636 __ mov(rsp, r12); // restore sp 2637 __ reinit_heapbase(); 2638 // Restore any method result value 2639 restore_native_result(masm, ret_type, stack_slots); 2640 __ bind(Continue); 2641 } 2642 2643 // change thread state 2644 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java); 2645 2646 if (LockingMode != LM_LEGACY && method->is_object_wait0()) { 2647 // Check preemption for Object.wait() 2648 __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset())); 2649 __ cmpptr(rscratch1, NULL_WORD); 2650 __ jccb(Assembler::equal, native_return); 2651 __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD); 2652 __ jmp(rscratch1); 2653 __ bind(native_return); 2654 2655 intptr_t the_pc = (intptr_t) __ pc(); 2656 oop_maps->add_gc_map(the_pc - start, map); 2657 } 2658 2659 2660 Label reguard; 2661 Label reguard_done; 2662 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled); 2663 __ jcc(Assembler::equal, reguard); 2664 __ bind(reguard_done); 2665 2666 // native result if any is live 2667 2668 // Unlock 2669 Label slow_path_unlock; 2670 Label unlock_done; 2671 if (method->is_synchronized()) { 2672 2673 Label fast_done; 2674 2675 // Get locked oop from the handle we passed to jni 2676 __ movptr(obj_reg, Address(oop_handle_reg, 0)); 2677 2678 if (LockingMode == LM_LEGACY) { 2679 Label not_recur; 2680 // Simple recursive lock? 
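  // A null displaced header in the BasicLock stack slot means the legacy fast
  // path above took the recursive case, so only the held-monitor count needs
  // to be decremented before we are done.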
2681 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD); 2682 __ jcc(Assembler::notEqual, not_recur); 2683 __ dec_held_monitor_count(); 2684 __ jmpb(fast_done); 2685 __ bind(not_recur); 2686 } 2687 2688 // Must save rax if it is live now because cmpxchg must use it 2689 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2690 save_native_result(masm, ret_type, stack_slots); 2691 } 2692 2693 if (LockingMode == LM_MONITOR) { 2694 __ jmp(slow_path_unlock); 2695 } else if (LockingMode == LM_LEGACY) { 2696 // get address of the stack lock 2697 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2698 // get old displaced header 2699 __ movptr(old_hdr, Address(rax, 0)); 2700 2701 // Atomic swap old header if oop still contains the stack lock 2702 __ lock(); 2703 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 2704 __ jcc(Assembler::notEqual, slow_path_unlock); 2705 __ dec_held_monitor_count(); 2706 } else { 2707 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 2708 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock); 2709 } 2710 2711 // slow path re-enters here 2712 __ bind(unlock_done); 2713 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) { 2714 restore_native_result(masm, ret_type, stack_slots); 2715 } 2716 2717 __ bind(fast_done); 2718 } 2719 if (DTraceMethodProbes) { 2720 save_native_result(masm, ret_type, stack_slots); 2721 __ mov_metadata(c_rarg1, method()); 2722 __ call_VM_leaf( 2723 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), 2724 r15_thread, c_rarg1); 2725 restore_native_result(masm, ret_type, stack_slots); 2726 } 2727 2728 __ reset_last_Java_frame(false); 2729 2730 // Unbox oop result, e.g. JNIHandles::resolve value. 2731 if (is_reference_type(ret_type)) { 2732 __ resolve_jobject(rax /* value */, 2733 r15_thread /* thread */, 2734 rcx /* tmp */); 2735 } 2736 2737 if (CheckJNICalls) { 2738 // clear_pending_jni_exception_check 2739 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD); 2740 } 2741 2742 // reset handle block 2743 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset())); 2744 __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD); 2745 2746 // pop our frame 2747 2748 __ leave(); 2749 2750 // Any exception pending? 2751 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2752 __ jcc(Assembler::notEqual, exception_pending); 2753 2754 // Return 2755 2756 __ ret(0); 2757 2758 // Unexpected paths are out of line and go here 2759 2760 // forward the exception 2761 __ bind(exception_pending); 2762 2763 // and forward the exception 2764 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2765 2766 // Slow path locking & unlocking 2767 if (method->is_synchronized()) { 2768 2769 // BEGIN Slow path lock 2770 __ bind(slow_path_lock); 2771 2772 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM 2773 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2774 2775 // protect the args we've loaded 2776 save_args(masm, total_c_args, c_arg, out_regs); 2777 2778 __ mov(c_rarg0, obj_reg); 2779 __ mov(c_rarg1, lock_reg); 2780 __ mov(c_rarg2, r15_thread); 2781 2782 // Not a leaf but we have last_Java_frame setup as we want. 
2783 // We don't want to unmount in case of contention since that would complicate preserving 2784 // the arguments that had already been marshalled into the native convention. So we force 2785 // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame()) 2786 // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack. 2787 __ push_cont_fastpath(); 2788 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); 2789 __ pop_cont_fastpath(); 2790 restore_args(masm, total_c_args, c_arg, out_regs); 2791 2792 #ifdef ASSERT 2793 { Label L; 2794 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2795 __ jcc(Assembler::equal, L); 2796 __ stop("no pending exception allowed on exit from monitorenter"); 2797 __ bind(L); 2798 } 2799 #endif 2800 __ jmp(lock_done); 2801 2802 // END Slow path lock 2803 2804 // BEGIN Slow path unlock 2805 __ bind(slow_path_unlock); 2806 2807 // If we haven't already saved the native result we must save it now as xmm registers 2808 // are still exposed. 2809 __ vzeroupper(); 2810 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2811 save_native_result(masm, ret_type, stack_slots); 2812 } 2813 2814 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size)); 2815 2816 __ mov(c_rarg0, obj_reg); 2817 __ mov(c_rarg2, r15_thread); 2818 __ mov(r12, rsp); // remember sp 2819 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2820 __ andptr(rsp, -16); // align stack as required by ABI 2821 2822 // Save pending exception around call to VM (which contains an EXCEPTION_MARK) 2823 // NOTE that obj_reg == rbx currently 2824 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset()))); 2825 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2826 2827 // args are (oop obj, BasicLock* lock, JavaThread* thread) 2828 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C))); 2829 __ mov(rsp, r12); // restore sp 2830 __ reinit_heapbase(); 2831 #ifdef ASSERT 2832 { 2833 Label L; 2834 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD); 2835 __ jcc(Assembler::equal, L); 2836 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); 2837 __ bind(L); 2838 } 2839 #endif /* ASSERT */ 2840 2841 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx); 2842 2843 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { 2844 restore_native_result(masm, ret_type, stack_slots); 2845 } 2846 __ jmp(unlock_done); 2847 2848 // END Slow path unlock 2849 2850 } // synchronized 2851 2852 // SLOW PATH Reguard the stack if needed 2853 2854 __ bind(reguard); 2855 __ vzeroupper(); 2856 save_native_result(masm, ret_type, stack_slots); 2857 __ mov(r12, rsp); // remember sp 2858 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 2859 __ andptr(rsp, -16); // align stack as required by ABI 2860 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); 2861 __ mov(rsp, r12); // restore sp 2862 __ reinit_heapbase(); 2863 restore_native_result(masm, ret_type, stack_slots); 2864 // and continue 2865 __ jmp(reguard_done); 2866 2867 2868 2869 __ flush(); 2870 2871 nmethod *nm = nmethod::new_native_nmethod(method, 2872 compile_id, 2873 masm->code(), 2874 vep_offset, 2875 frame_complete, 2876 stack_slots / VMRegImpl::slots_per_word, 2877 (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), 2878 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), 2879 oop_maps); 2880 2881 return nm; 2882 } 2883 2884 // this function returns the adjust size (in number of words) to a c2i adapter 2885 // activation for use during deoptimization 2886 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) { 2887 return (callee_locals - callee_parameters) * Interpreter::stackElementWords; 2888 } 2889 2890 2891 uint SharedRuntime::out_preserve_stack_slots() { 2892 return 0; 2893 } 2894 2895 2896 // Number of stack slots between incoming argument block and the start of 2897 // a new frame. The PROLOG must add this many slots to the stack. The 2898 // EPILOG must remove this many slots. amd64 needs two slots for 2899 // return address. 2900 uint SharedRuntime::in_preserve_stack_slots() { 2901 return 4 + 2 * VerifyStackAtCalls; 2902 } 2903 2904 VMReg SharedRuntime::thread_register() { 2905 return r15_thread->as_VMReg(); 2906 } 2907 2908 //------------------------------generate_deopt_blob---------------------------- 2909 void SharedRuntime::generate_deopt_blob() { 2910 // Allocate space for the code 2911 ResourceMark rm; 2912 // Setup code generation tools 2913 int pad = 0; 2914 if (UseAVX > 2) { 2915 pad += 1024; 2916 } 2917 if (UseAPX) { 2918 pad += 1024; 2919 } 2920 #if INCLUDE_JVMCI 2921 if (EnableJVMCI) { 2922 pad += 512; // Increase the buffer size when compiling for JVMCI 2923 } 2924 #endif 2925 const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id); 2926 CodeBuffer buffer(name, 2560+pad, 1024); 2927 MacroAssembler* masm = new MacroAssembler(&buffer); 2928 int frame_size_in_words; 2929 OopMap* map = nullptr; 2930 OopMapSet *oop_maps = new OopMapSet(); 2931 2932 // ------------- 2933 // This code enters when returning to a de-optimized nmethod. A return 2934 // address has been pushed on the stack, and return values are in 2935 // registers. 2936 // If we are doing a normal deopt then we were called from the patched 2937 // nmethod from the point we returned to the nmethod. So the return 2938 // address on the stack is wrong by NativeCall::instruction_size 2939 // We will adjust the value so it looks like we have the original return 2940 // address on the stack (like when we eagerly deoptimized). 2941 // In the case of an exception pending when deoptimizing, we enter 2942 // with a return address on the stack that points after the call we patched 2943 // into the exception handler. We have the following register state from, 2944 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). 2945 // rax: exception oop 2946 // rbx: exception handler 2947 // rdx: throwing pc 2948 // So in this case we simply jam rdx into the useless return address and 2949 // the stack looks just like we want. 2950 // 2951 // At this point we need to de-opt. We save the argument return 2952 // registers. We call the first C routine, fetch_unroll_info(). This 2953 // routine captures the return values and returns a structure which 2954 // describes the current frame size and the sizes of all replacement frames. 2955 // The current frame is compiled code and may contain many inlined 2956 // functions, each with their own JVM state. We pop the current frame, then 2957 // push all the new frames. Then we call the C routine unpack_frames() to 2958 // populate these frames. Finally unpack_frames() returns us the new target 2959 // address. 
Notice that callee-save registers are BLOWN here; they have 2960 // already been captured in the vframeArray at the time the return PC was 2961 // patched. 2962 address start = __ pc(); 2963 Label cont; 2964 2965 // Prolog for non exception case! 2966 2967 // Save everything in sight. 2968 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2969 2970 // Normal deoptimization. Save exec mode for unpack_frames. 2971 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved 2972 __ jmp(cont); 2973 2974 int reexecute_offset = __ pc() - start; 2975 #if INCLUDE_JVMCI && !defined(COMPILER1) 2976 if (UseJVMCICompiler) { 2977 // JVMCI does not use this kind of deoptimization 2978 __ should_not_reach_here(); 2979 } 2980 #endif 2981 2982 // Reexecute case 2983 // return address is the pc describes what bci to do re-execute at 2984 2985 // No need to update map as each call to save_live_registers will produce identical oopmap 2986 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 2987 2988 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved 2989 __ jmp(cont); 2990 2991 #if INCLUDE_JVMCI 2992 Label after_fetch_unroll_info_call; 2993 int implicit_exception_uncommon_trap_offset = 0; 2994 int uncommon_trap_offset = 0; 2995 2996 if (EnableJVMCI) { 2997 implicit_exception_uncommon_trap_offset = __ pc() - start; 2998 2999 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset()))); 3000 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD); 3001 3002 uncommon_trap_offset = __ pc() - start; 3003 3004 // Save everything in sight. 3005 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 3006 // fetch_unroll_info needs to call last_java_frame() 3007 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3008 3009 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset()))); 3010 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1); 3011 3012 __ movl(r14, Deoptimization::Unpack_reexecute); 3013 __ mov(c_rarg0, r15_thread); 3014 __ movl(c_rarg2, r14); // exec mode 3015 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap))); 3016 oop_maps->add_gc_map( __ pc()-start, map->deep_copy()); 3017 3018 __ reset_last_Java_frame(false); 3019 3020 __ jmp(after_fetch_unroll_info_call); 3021 } // EnableJVMCI 3022 #endif // INCLUDE_JVMCI 3023 3024 int exception_offset = __ pc() - start; 3025 3026 // Prolog for exception case 3027 3028 // all registers are dead at this entry point, except for rax, and 3029 // rdx which contain the exception oop and exception pc 3030 // respectively. Set them in TLS and fall thru to the 3031 // unpack_with_exception_in_tls entry point. 3032 3033 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx); 3034 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax); 3035 3036 int exception_in_tls_offset = __ pc() - start; 3037 3038 // new implementation because exception oop is now passed in JavaThread 3039 3040 // Prolog for exception case 3041 // All registers must be preserved because they might be used by LinearScan 3042 // Exceptiop oop and throwing PC are passed in JavaThread 3043 // tos: stack at point of call to method that threw the exception (i.e. 
only 3044 // args are on the stack, no return address) 3045 3046 // make room on stack for the return address 3047 // It will be patched later with the throwing pc. The correct value is not 3048 // available now because loading it from memory would destroy registers. 3049 __ push(0); 3050 3051 // Save everything in sight. 3052 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true); 3053 3054 // Now it is safe to overwrite any register 3055 3056 // Deopt during an exception. Save exec mode for unpack_frames. 3057 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved 3058 3059 // load throwing pc from JavaThread and patch it as the return address 3060 // of the current frame. Then clear the field in JavaThread 3061 3062 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3063 __ movptr(Address(rbp, wordSize), rdx); 3064 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 3065 3066 #ifdef ASSERT 3067 // verify that there is really an exception oop in JavaThread 3068 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3069 __ verify_oop(rax); 3070 3071 // verify that there is no pending exception 3072 Label no_pending_exception; 3073 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3074 __ testptr(rax, rax); 3075 __ jcc(Assembler::zero, no_pending_exception); 3076 __ stop("must not have pending exception here"); 3077 __ bind(no_pending_exception); 3078 #endif 3079 3080 __ bind(cont); 3081 3082 // Call C code. Need thread and this frame, but NOT official VM entry 3083 // crud. We cannot block on this call, no GC can happen. 3084 // 3085 // UnrollBlock* fetch_unroll_info(JavaThread* thread) 3086 3087 // fetch_unroll_info needs to call last_java_frame(). 3088 3089 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3090 #ifdef ASSERT 3091 { Label L; 3092 __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 3093 __ jcc(Assembler::equal, L); 3094 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); 3095 __ bind(L); 3096 } 3097 #endif // ASSERT 3098 __ mov(c_rarg0, r15_thread); 3099 __ movl(c_rarg1, r14); // exec_mode 3100 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); 3101 3102 // Need to have an oopmap that tells fetch_unroll_info where to 3103 // find any register it might need. 3104 oop_maps->add_gc_map(__ pc() - start, map); 3105 3106 __ reset_last_Java_frame(false); 3107 3108 #if INCLUDE_JVMCI 3109 if (EnableJVMCI) { 3110 __ bind(after_fetch_unroll_info_call); 3111 } 3112 #endif 3113 3114 // Load UnrollBlock* into rdi 3115 __ mov(rdi, rax); 3116 3117 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset())); 3118 Label noException; 3119 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending? 3120 __ jcc(Assembler::notEqual, noException); 3121 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset())); 3122 // QQQ this is useless it was null above 3123 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset())); 3124 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD); 3125 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD); 3126 3127 __ verify_oop(rax); 3128 3129 // Overwrite the result registers with the exception results. 
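  // (The "result registers" here are the rax/rdx slots of the register save
  //  area built by save_live_registers above; the stores below overwrite those
  //  saved values with the exception oop and the throwing pc.)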
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers.  Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
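  // Rough sketch of the sequence emitted below (illustration only, not literal
  // code; names follow the UnrollBlock accessors used here):
  //
  //   sender_sp = rsp;                             // original (unextended) sp
  //   rsp      -= caller_adjustment;               // room for the extra locals
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(frame_pcs[k]);                        // return address of frame k
  //     push(rbp); rbp = rsp;                      // enter()
  //     rsp -= frame_sizes[k] - 2 * wordSize;      // pc and rbp were pushed by hand
  //     interpreter_frame_last_sp   = nullptr;     // corrected by layout_activation_impl
  //     interpreter_frame_sender_sp = sender_sp;   // make the frame walkable
  //     sender_sp = rsp;                           // becomes the caller sp of the next frame
  //   }
  //   push(frame_pcs[number_of_frames]);           // return address back into this blob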
3189 3190 const Register sender_sp = r8; 3191 3192 __ mov(sender_sp, rsp); 3193 __ movl(rbx, Address(rdi, 3194 Deoptimization::UnrollBlock:: 3195 caller_adjustment_offset())); 3196 __ subptr(rsp, rbx); 3197 3198 // Push interpreter frames in a loop 3199 Label loop; 3200 __ bind(loop); 3201 __ movptr(rbx, Address(rsi, 0)); // Load frame size 3202 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand 3203 __ pushptr(Address(rcx, 0)); // Save return address 3204 __ enter(); // Save old & set new ebp 3205 __ subptr(rsp, rbx); // Prolog 3206 // This value is corrected by layout_activation_impl 3207 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); 3208 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable 3209 __ mov(sender_sp, rsp); // Pass sender_sp to next frame 3210 __ addptr(rsi, wordSize); // Bump array pointer (sizes) 3211 __ addptr(rcx, wordSize); // Bump array pointer (pcs) 3212 __ decrementl(rdx); // Decrement counter 3213 __ jcc(Assembler::notZero, loop); 3214 __ pushptr(Address(rcx, 0)); // Save final return address 3215 3216 // Re-push self-frame 3217 __ enter(); // Save old & set new ebp 3218 3219 // Allocate a full sized register save area. 3220 // Return address and rbp are in place, so we allocate two less words. 3221 __ subptr(rsp, (frame_size_in_words - 2) * wordSize); 3222 3223 // Restore frame locals after moving the frame 3224 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0); 3225 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3226 3227 // Call C code. Need thread but NOT official VM entry 3228 // crud. We cannot block on this call, no GC can happen. Call should 3229 // restore return values to their stack-slots with the new SP. 3230 // 3231 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) 3232 3233 // Use rbp because the frames look interpreted now 3234 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. 3235 // Don't need the precise return PC here, just precise enough to point into this code blob. 3236 address the_pc = __ pc(); 3237 __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1); 3238 3239 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI 3240 __ mov(c_rarg0, r15_thread); 3241 __ movl(c_rarg1, r14); // second arg: exec_mode 3242 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); 3243 // Revert SP alignment after call since we're going to do some SP relative addressing below 3244 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset())); 3245 3246 // Set an oopmap for the call site 3247 // Use the same PC we used for the last java frame 3248 oop_maps->add_gc_map(the_pc - start, 3249 new OopMap( frame_size_in_words, 0 )); 3250 3251 // Clear fp AND pc 3252 __ reset_last_Java_frame(true); 3253 3254 // Collect return values 3255 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes())); 3256 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes())); 3257 // I think this is useless (throwing pc?) 3258 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes())); 3259 3260 // Pop self-frame. 
3261 __ leave(); // Epilog 3262 3263 // Jump to interpreter 3264 __ ret(0); 3265 3266 // Make sure all code is generated 3267 masm->flush(); 3268 3269 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); 3270 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); 3271 #if INCLUDE_JVMCI 3272 if (EnableJVMCI) { 3273 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset); 3274 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset); 3275 } 3276 #endif 3277 } 3278 3279 //------------------------------generate_handler_blob------ 3280 // 3281 // Generate a special Compile2Runtime blob that saves all registers, 3282 // and setup oopmap. 3283 // 3284 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) { 3285 assert(StubRoutines::forward_exception_entry() != nullptr, 3286 "must be generated before"); 3287 assert(is_polling_page_id(id), "expected a polling page stub id"); 3288 3289 ResourceMark rm; 3290 OopMapSet *oop_maps = new OopMapSet(); 3291 OopMap* map; 3292 3293 // Allocate space for the code. Setup code generation tools. 3294 const char* name = SharedRuntime::stub_name(id); 3295 CodeBuffer buffer(name, 2548, 1024); 3296 MacroAssembler* masm = new MacroAssembler(&buffer); 3297 3298 address start = __ pc(); 3299 address call_pc = nullptr; 3300 int frame_size_in_words; 3301 bool cause_return = (id == SharedStubId::polling_page_return_handler_id); 3302 bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id); 3303 3304 // Make room for return address (or push it again) 3305 if (!cause_return) { 3306 __ push(rbx); 3307 } 3308 3309 // Save registers, fpu state, and flags 3310 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors); 3311 3312 // The following is basically a call_VM. However, we need the precise 3313 // address of the call in order to generate an oopmap. Hence, we do all the 3314 // work ourselves. 3315 3316 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next: 3317 3318 // The return address must always be correct so that frame constructor never 3319 // sees an invalid pc. 3320 3321 if (!cause_return) { 3322 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack. 3323 // Additionally, rbx is a callee saved register and we can look at it later to determine 3324 // if someone changed the return address for us! 3325 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset())); 3326 __ movptr(Address(rbp, wordSize), rbx); 3327 } 3328 3329 // Do the call 3330 __ mov(c_rarg0, r15_thread); 3331 __ call(RuntimeAddress(call_ptr)); 3332 3333 // Set an oopmap for the call site. This oopmap will map all 3334 // oop-registers and debug-info registers as callee-saved. This 3335 // will allow deoptimization at this safepoint to find all possible 3336 // debug-info recordings, as well as let GC find all oops. 
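  // The offset recorded below is __ pc() - start right after the emitted call,
  // i.e. the return address the VM will see in this frame when it walks the
  // stack at the safepoint; oopmaps are looked up by that pc offset.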

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00          test   %eax,(%rax)
    // 85 01          test   %eax,(%rcx)
    // 85 02          test   %eax,(%rdx)
    // 85 03          test   %eax,(%rbx)
    // 85 06          test   %eax,(%rsi)
    // 85 07          test   %eax,(%rdi)
    //
    // 41 85 00       test   %eax,(%r8)
    // 41 85 01       test   %eax,(%r9)
    // 41 85 02       test   %eax,(%r10)
    // 41 85 03       test   %eax,(%r11)
    // 41 85 06       test   %eax,(%r14)
    // 41 85 07       test   %eax,(%r15)
    //
    // 85 04 24       test   %eax,(%rsp)
    // 41 85 04 24    test   %eax,(%r12)
    // 85 45 00       test   %eax,0x0(%rbp)
    // 41 85 45 00    test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of the legacy MAP0 test instruction:
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
    //     register operand and of the base register of the memory operand are in [0-8), so no
    //     additional REX prefix (whose REX.B bit would hold the MSB of the register encoding)
    //     is required, which is why a two-byte encoding is sufficient here.
    //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the base
    //     register of the memory operand is 0b1000, so an additional REX prefix is needed,
    //     thereby adding one byte to the instruction encoding.
    //  o  If the base register is one of the 32 extended GPRs available only on targets
    //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
    //     the most significant two bits of the 5-bit register encoding.

    if (VM_Version::supports_apx_f()) {
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
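    // (rax was stashed above to point at where the 0x85 opcode byte must be,
    //  i.e. just past any REX/REX2 prefix that was stepped over.)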
3428 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl); 3429 __ jcc(Assembler::notEqual, bail); 3430 // Mask out the modrm bits 3431 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask); 3432 // rax encodes to 0, so if the bits are nonzero it's incorrect 3433 __ jcc(Assembler::notZero, bail); 3434 #endif 3435 // Adjust return pc forward to step over the safepoint poll instruction 3436 __ addptr(rbx, 2); 3437 __ movptr(Address(rbp, wordSize), rbx); 3438 } 3439 3440 __ bind(no_adjust); 3441 // Normal exit, restore registers and exit. 3442 RegisterSaver::restore_live_registers(masm, save_wide_vectors); 3443 __ ret(0); 3444 3445 #ifdef ASSERT 3446 __ bind(bail); 3447 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); 3448 #endif 3449 3450 // Make sure all code is generated 3451 masm->flush(); 3452 3453 // Fill-out other meta info 3454 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); 3455 } 3456 3457 // 3458 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss 3459 // 3460 // Generate a stub that calls into vm to find out the proper destination 3461 // of a java call. All the argument registers are live at this point 3462 // but since this is generic code we don't know what they are and the caller 3463 // must do any gc of the args. 3464 // 3465 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) { 3466 assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before"); 3467 assert(is_resolve_id(id), "expected a resolve stub id"); 3468 3469 // allocate space for the code 3470 ResourceMark rm; 3471 3472 const char* name = SharedRuntime::stub_name(id); 3473 CodeBuffer buffer(name, 1552, 512); 3474 MacroAssembler* masm = new MacroAssembler(&buffer); 3475 3476 int frame_size_in_words; 3477 3478 OopMapSet *oop_maps = new OopMapSet(); 3479 OopMap* map = nullptr; 3480 3481 int start = __ offset(); 3482 3483 // No need to save vector registers since they are caller-saved anyway. 3484 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false); 3485 3486 int frame_complete = __ offset(); 3487 3488 __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1); 3489 3490 __ mov(c_rarg0, r15_thread); 3491 3492 __ call(RuntimeAddress(destination)); 3493 3494 3495 // Set an oopmap for the call site. 3496 // We need this not only for callee-saved registers, but also for volatile 3497 // registers that the compiler might be keeping live across a safepoint. 3498 3499 oop_maps->add_gc_map( __ offset() - start, map); 3500 3501 // rax contains the address we are going to jump to assuming no exception got installed 3502 3503 // clear last_Java_sp 3504 __ reset_last_Java_frame(false); 3505 // check for pending exceptions 3506 Label pending; 3507 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3508 __ jcc(Assembler::notEqual, pending); 3509 3510 // get the returned Method* 3511 __ get_vm_result_2(rbx, r15_thread); 3512 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx); 3513 3514 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax); 3515 3516 RegisterSaver::restore_live_registers(masm); 3517 3518 // We are back to the original state on entry and ready to go. 
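  // rax was reloaded from its save slot by restore_live_registers above, so it
  // still holds the destination address returned by the runtime call; jumping
  // to it resumes the resolved callee as if the original call had gone there
  // directly.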
3519 3520 __ jmp(rax); 3521 3522 // Pending exception after the safepoint 3523 3524 __ bind(pending); 3525 3526 RegisterSaver::restore_live_registers(masm); 3527 3528 // exception pending => remove activation and forward to exception handler 3529 3530 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 3531 3532 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3533 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3534 3535 // ------------- 3536 // make sure all code is generated 3537 masm->flush(); 3538 3539 // return the blob 3540 // frame_size_words or bytes?? 3541 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3542 } 3543 3544 // Continuation point for throwing of implicit exceptions that are 3545 // not handled in the current activation. Fabricates an exception 3546 // oop and initiates normal exception dispatching in this 3547 // frame. Since we need to preserve callee-saved values (currently 3548 // only for C2, but done for C1 as well) we need a callee-saved oop 3549 // map and therefore have to make these stubs into RuntimeStubs 3550 // rather than BufferBlobs. If the compiler needs all registers to 3551 // be preserved between the fault point and the exception handler 3552 // then it must assume responsibility for that in 3553 // AbstractCompiler::continuation_for_implicit_null_exception or 3554 // continuation_for_implicit_division_by_zero_exception. All other 3555 // implicit exceptions (e.g., NullPointerException or 3556 // AbstractMethodError on entry) are either at call sites or 3557 // otherwise assume that stack unwinding will be initiated, so 3558 // caller saved registers were assumed volatile in the compiler. 3559 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) { 3560 assert(is_throw_id(id), "expected a throw stub id"); 3561 3562 const char* name = SharedRuntime::stub_name(id); 3563 3564 // Information about frame layout at time of blocking runtime call. 3565 // Note that we only have to preserve callee-saved registers since 3566 // the compilers are responsible for supplying a continuation point 3567 // if they expect all registers to be preserved. 
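  // Offsets below are in 32-bit VMRegImpl slots, so the saved rbp and the
  // return address take two slots each.  frame::arg_reg_save_area_bytes
  // accounts for the register-argument shadow area that the Windows x64 ABI
  // requires below rbp; on other platforms it is expected to be zero, keeping
  // framesize a multiple of four slots (16 bytes).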
3568 enum layout { 3569 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 3570 rbp_off2, 3571 return_off, 3572 return_off2, 3573 framesize // inclusive of return address 3574 }; 3575 3576 int insts_size = 512; 3577 int locs_size = 64; 3578 3579 ResourceMark rm; 3580 const char* timer_msg = "SharedRuntime generate_throw_exception"; 3581 TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime)); 3582 3583 CodeBuffer code(name, insts_size, locs_size); 3584 OopMapSet* oop_maps = new OopMapSet(); 3585 MacroAssembler* masm = new MacroAssembler(&code); 3586 3587 address start = __ pc(); 3588 3589 // This is an inlined and slightly modified version of call_VM 3590 // which has the ability to fetch the return PC out of 3591 // thread-local storage and also sets up last_Java_sp slightly 3592 // differently than the real call_VM 3593 3594 __ enter(); // required for proper stackwalking of RuntimeStub frame 3595 3596 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3597 3598 // return address and rbp are already in place 3599 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 3600 3601 int frame_complete = __ pc() - start; 3602 3603 // Set up last_Java_sp and last_Java_fp 3604 address the_pc = __ pc(); 3605 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 3606 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 3607 3608 // Call runtime 3609 __ movptr(c_rarg0, r15_thread); 3610 BLOCK_COMMENT("call runtime_entry"); 3611 __ call(RuntimeAddress(runtime_entry)); 3612 3613 // Generate oop map 3614 OopMap* map = new OopMap(framesize, 0); 3615 3616 oop_maps->add_gc_map(the_pc - start, map); 3617 3618 __ reset_last_Java_frame(true); 3619 3620 __ leave(); // required for proper stackwalking of RuntimeStub frame 3621 3622 // check for pending exceptions 3623 #ifdef ASSERT 3624 Label L; 3625 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD); 3626 __ jcc(Assembler::notEqual, L); 3627 __ should_not_reach_here(); 3628 __ bind(L); 3629 #endif // ASSERT 3630 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3631 3632 3633 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3634 RuntimeStub* stub = 3635 RuntimeStub::new_runtime_stub(name, 3636 &code, 3637 frame_complete, 3638 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3639 oop_maps, false); 3640 return stub; 3641 } 3642 3643 //------------------------------Montgomery multiplication------------------------ 3644 // 3645 3646 #ifndef _WINDOWS 3647 3648 // Subtract 0:b from carry:a. Return carry. 3649 static julong 3650 sub(julong a[], julong b[], julong carry, long len) { 3651 long long i = 0, cnt = len; 3652 julong tmp; 3653 asm volatile("clc; " 3654 "0: ; " 3655 "mov (%[b], %[i], 8), %[tmp]; " 3656 "sbb %[tmp], (%[a], %[i], 8); " 3657 "inc %[i]; dec %[cnt]; " 3658 "jne 0b; " 3659 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " 3660 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) 3661 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) 3662 : "memory"); 3663 return tmp; 3664 } 3665 3666 // Multiply (unsigned) Long A by Long B, accumulating the double- 3667 // length result into the accumulator formed of T0, T1, and T2. 3668 #define MACC(A, B, T0, T1, T2) \ 3669 do { \ 3670 unsigned long hi, lo; \ 3671 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3672 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3673 : "r"(A), "a"(B) : "cc"); \ 3674 } while(0) 3675 3676 // As above, but add twice the double-length result into the 3677 // accumulator. 
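// In other words, treating T2:T1:T0 as a single 192-bit accumulator:
//   MACC(A, B, T0, T1, T2)  does  T2:T1:T0 += (unsigned __int128)A * B;
//   MACC2(A, B, T0, T1, T2) does  T2:T1:T0 += 2 * (unsigned __int128)A * B;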
3678 #define MACC2(A, B, T0, T1, T2) \ 3679 do { \ 3680 unsigned long hi, lo; \ 3681 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ 3682 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 3683 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 3684 : "r"(A), "a"(B) : "cc"); \ 3685 } while(0) 3686 3687 #else //_WINDOWS 3688 3689 static julong 3690 sub(julong a[], julong b[], julong carry, long len) { 3691 long i; 3692 julong tmp; 3693 unsigned char c = 1; 3694 for (i = 0; i < len; i++) { 3695 c = _addcarry_u64(c, a[i], ~b[i], &tmp); 3696 a[i] = tmp; 3697 } 3698 c = _addcarry_u64(c, carry, ~0, &tmp); 3699 return tmp; 3700 } 3701 3702 // Multiply (unsigned) Long A by Long B, accumulating the double- 3703 // length result into the accumulator formed of T0, T1, and T2. 3704 #define MACC(A, B, T0, T1, T2) \ 3705 do { \ 3706 julong hi, lo; \ 3707 lo = _umul128(A, B, &hi); \ 3708 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3709 c = _addcarry_u64(c, hi, T1, &T1); \ 3710 _addcarry_u64(c, T2, 0, &T2); \ 3711 } while(0) 3712 3713 // As above, but add twice the double-length result into the 3714 // accumulator. 3715 #define MACC2(A, B, T0, T1, T2) \ 3716 do { \ 3717 julong hi, lo; \ 3718 lo = _umul128(A, B, &hi); \ 3719 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ 3720 c = _addcarry_u64(c, hi, T1, &T1); \ 3721 _addcarry_u64(c, T2, 0, &T2); \ 3722 c = _addcarry_u64(0, lo, T0, &T0); \ 3723 c = _addcarry_u64(c, hi, T1, &T1); \ 3724 _addcarry_u64(c, T2, 0, &T2); \ 3725 } while(0) 3726 3727 #endif //_WINDOWS 3728 3729 // Fast Montgomery multiplication. The derivation of the algorithm is 3730 // in A Cryptographic Library for the Motorola DSP56000, 3731 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3732 3733 static void NOINLINE 3734 montgomery_multiply(julong a[], julong b[], julong n[], 3735 julong m[], julong inv, int len) { 3736 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3737 int i; 3738 3739 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 3740 3741 for (i = 0; i < len; i++) { 3742 int j; 3743 for (j = 0; j < i; j++) { 3744 MACC(a[j], b[i-j], t0, t1, t2); 3745 MACC(m[j], n[i-j], t0, t1, t2); 3746 } 3747 MACC(a[i], b[0], t0, t1, t2); 3748 m[i] = t0 * inv; 3749 MACC(m[i], n[0], t0, t1, t2); 3750 3751 assert(t0 == 0, "broken Montgomery multiply"); 3752 3753 t0 = t1; t1 = t2; t2 = 0; 3754 } 3755 3756 for (i = len; i < 2*len; i++) { 3757 int j; 3758 for (j = i-len+1; j < len; j++) { 3759 MACC(a[j], b[i-j], t0, t1, t2); 3760 MACC(m[j], n[i-j], t0, t1, t2); 3761 } 3762 m[i-len] = t0; 3763 t0 = t1; t1 = t2; t2 = 0; 3764 } 3765 3766 while (t0) 3767 t0 = sub(m, n, t0, len); 3768 } 3769 3770 // Fast Montgomery squaring. This uses asymptotically 25% fewer 3771 // multiplies so it should be up to 25% faster than Montgomery 3772 // multiplication. However, its loop control is more complex and it 3773 // may actually run slower on some machines. 
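// The saving comes from symmetry: within column i of the square, the two cross
// terms a[j]*a[i-j] and a[i-j]*a[j] are equal, so each pair is accumulated once
// with MACC2 (which adds the product twice), and only the diagonal term
// a[i/2]^2, present when i is even, needs a plain MACC.  For example, column 2
// of (a2*x^2 + a1*x + a0)^2 is 2*a0*a2 + a1*a1.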
3774 3775 static void NOINLINE 3776 montgomery_square(julong a[], julong n[], 3777 julong m[], julong inv, int len) { 3778 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3779 int i; 3780 3781 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 3782 3783 for (i = 0; i < len; i++) { 3784 int j; 3785 int end = (i+1)/2; 3786 for (j = 0; j < end; j++) { 3787 MACC2(a[j], a[i-j], t0, t1, t2); 3788 MACC(m[j], n[i-j], t0, t1, t2); 3789 } 3790 if ((i & 1) == 0) { 3791 MACC(a[j], a[j], t0, t1, t2); 3792 } 3793 for (; j < i; j++) { 3794 MACC(m[j], n[i-j], t0, t1, t2); 3795 } 3796 m[i] = t0 * inv; 3797 MACC(m[i], n[0], t0, t1, t2); 3798 3799 assert(t0 == 0, "broken Montgomery square"); 3800 3801 t0 = t1; t1 = t2; t2 = 0; 3802 } 3803 3804 for (i = len; i < 2*len; i++) { 3805 int start = i-len+1; 3806 int end = start + (len - start)/2; 3807 int j; 3808 for (j = start; j < end; j++) { 3809 MACC2(a[j], a[i-j], t0, t1, t2); 3810 MACC(m[j], n[i-j], t0, t1, t2); 3811 } 3812 if ((i & 1) == 0) { 3813 MACC(a[j], a[j], t0, t1, t2); 3814 } 3815 for (; j < len; j++) { 3816 MACC(m[j], n[i-j], t0, t1, t2); 3817 } 3818 m[i-len] = t0; 3819 t0 = t1; t1 = t2; t2 = 0; 3820 } 3821 3822 while (t0) 3823 t0 = sub(m, n, t0, len); 3824 } 3825 3826 // Swap words in a longword. 3827 static julong swap(julong x) { 3828 return (x << 32) | (x >> 32); 3829 } 3830 3831 // Copy len longwords from s to d, word-swapping as we go. The 3832 // destination array is reversed. 3833 static void reverse_words(julong *s, julong *d, int len) { 3834 d += len; 3835 while(len-- > 0) { 3836 d--; 3837 *d = swap(*s); 3838 s++; 3839 } 3840 } 3841 3842 // The threshold at which squaring is advantageous was determined 3843 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 3844 #define MONTGOMERY_SQUARING_THRESHOLD 64 3845 3846 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 3847 jint len, jlong inv, 3848 jint *m_ints) { 3849 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 3850 int longwords = len/2; 3851 3852 // Make very sure we don't use so much space that the stack might 3853 // overflow. 512 jints corresponds to an 16384-bit integer and 3854 // will use here a total of 8k bytes of stack space. 3855 int divisor = sizeof(julong) * 4; 3856 guarantee(longwords <= 8192 / divisor, "must be"); 3857 int total_allocation = longwords * sizeof (julong) * 4; 3858 julong *scratch = (julong *)alloca(total_allocation); 3859 3860 // Local scratch arrays 3861 julong 3862 *a = scratch + 0 * longwords, 3863 *b = scratch + 1 * longwords, 3864 *n = scratch + 2 * longwords, 3865 *m = scratch + 3 * longwords; 3866 3867 reverse_words((julong *)a_ints, a, longwords); 3868 reverse_words((julong *)b_ints, b, longwords); 3869 reverse_words((julong *)n_ints, n, longwords); 3870 3871 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 3872 3873 reverse_words(m, (julong *)m_ints, longwords); 3874 } 3875 3876 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 3877 jint len, jlong inv, 3878 jint *m_ints) { 3879 assert(len % 2 == 0, "array length in montgomery_square must be even"); 3880 int longwords = len/2; 3881 3882 // Make very sure we don't use so much space that the stack might 3883 // overflow. 512 jints corresponds to an 16384-bit integer and 3884 // will use here a total of 6k bytes of stack space. 
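  // For example, len == 512 jints gives longwords == 256, and the three scratch
  // arrays of 256 julongs each occupy 3 * 256 * 8 == 6144 bytes; the guarantee
  // below caps longwords at 8192 / 24 == 341.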
3885 int divisor = sizeof(julong) * 3; 3886 guarantee(longwords <= (8192 / divisor), "must be"); 3887 int total_allocation = longwords * sizeof (julong) * 3; 3888 julong *scratch = (julong *)alloca(total_allocation); 3889 3890 // Local scratch arrays 3891 julong 3892 *a = scratch + 0 * longwords, 3893 *n = scratch + 1 * longwords, 3894 *m = scratch + 2 * longwords; 3895 3896 reverse_words((julong *)a_ints, a, longwords); 3897 reverse_words((julong *)n_ints, n, longwords); 3898 3899 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 3900 ::montgomery_square(a, n, m, (julong)inv, longwords); 3901 } else { 3902 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 3903 } 3904 3905 reverse_words(m, (julong *)m_ints, longwords); 3906 } 3907 3908 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) { 3909 BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K); 3910 CodeBuffer buffer(buf); 3911 short buffer_locs[20]; 3912 buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs, 3913 sizeof(buffer_locs)/sizeof(relocInfo)); 3914 3915 MacroAssembler* masm = new MacroAssembler(&buffer); 3916 3917 const Array<SigEntry>* sig_vk = vk->extended_sig(); 3918 const Array<VMRegPair>* regs = vk->return_regs(); 3919 3920 int pack_fields_jobject_off = __ offset(); 3921 // Resolve pre-allocated buffer from JNI handle. 3922 // We cannot do this in generate_call_stub() because it requires GC code to be initialized. 3923 __ movptr(rax, Address(r13, 0)); 3924 __ resolve_jobject(rax /* value */, 3925 r15_thread /* thread */, 3926 r12 /* tmp */); 3927 __ movptr(Address(r13, 0), rax); 3928 3929 int pack_fields_off = __ offset(); 3930 3931 int j = 1; 3932 for (int i = 0; i < sig_vk->length(); i++) { 3933 BasicType bt = sig_vk->at(i)._bt; 3934 if (bt == T_METADATA) { 3935 continue; 3936 } 3937 if (bt == T_VOID) { 3938 if (sig_vk->at(i-1)._bt == T_LONG || 3939 sig_vk->at(i-1)._bt == T_DOUBLE) { 3940 j++; 3941 } 3942 continue; 3943 } 3944 int off = sig_vk->at(i)._offset; 3945 assert(off > 0, "offset in object should be positive"); 3946 VMRegPair pair = regs->at(j); 3947 VMReg r_1 = pair.first(); 3948 VMReg r_2 = pair.second(); 3949 Address to(rax, off); 3950 if (bt == T_FLOAT) { 3951 __ movflt(to, r_1->as_XMMRegister()); 3952 } else if (bt == T_DOUBLE) { 3953 __ movdbl(to, r_1->as_XMMRegister()); 3954 } else { 3955 Register val = r_1->as_Register(); 3956 assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1); 3957 if (is_reference_type(bt)) { 3958 __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 3959 } else { 3960 __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt)); 3961 } 3962 } 3963 j++; 3964 } 3965 assert(j == regs->length(), "missed a field?"); 3966 if (vk->has_nullable_atomic_layout()) { 3967 // Set the null marker 3968 __ movb(Address(rax, vk->null_marker_offset()), 1); 3969 } 3970 __ ret(0); 3971 3972 int unpack_fields_off = __ offset(); 3973 3974 Label skip; 3975 __ testptr(rax, rax); 3976 __ jcc(Assembler::zero, skip); 3977 3978 j = 1; 3979 for (int i = 0; i < sig_vk->length(); i++) { 3980 BasicType bt = sig_vk->at(i)._bt; 3981 if (bt == T_METADATA) { 3982 continue; 3983 } 3984 if (bt == T_VOID) { 3985 if (sig_vk->at(i-1)._bt == T_LONG || 3986 sig_vk->at(i-1)._bt == T_DOUBLE) { 3987 j++; 3988 } 3989 continue; 3990 } 3991 int off = sig_vk->at(i)._offset; 3992 assert(off > 0, "offset in object should be positive"); 3993 VMRegPair pair = regs->at(j); 3994 VMReg r_1 = 
pair.first(); 3995 VMReg r_2 = pair.second(); 3996 Address from(rax, off); 3997 if (bt == T_FLOAT) { 3998 __ movflt(r_1->as_XMMRegister(), from); 3999 } else if (bt == T_DOUBLE) { 4000 __ movdbl(r_1->as_XMMRegister(), from); 4001 } else if (bt == T_OBJECT || bt == T_ARRAY) { 4002 assert_different_registers(rax, r_1->as_Register()); 4003 __ load_heap_oop(r_1->as_Register(), from); 4004 } else { 4005 assert(is_java_primitive(bt), "unexpected basic type"); 4006 assert_different_registers(rax, r_1->as_Register()); 4007 size_t size_in_bytes = type2aelembytes(bt); 4008 __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN); 4009 } 4010 j++; 4011 } 4012 assert(j == regs->length(), "missed a field?"); 4013 4014 __ bind(skip); 4015 __ ret(0); 4016 4017 __ flush(); 4018 4019 return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off); 4020 } 4021 4022 #if INCLUDE_JFR 4023 4024 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 4025 // It returns a jobject handle to the event writer. 4026 // The handle is dereferenced and the return value is the event writer oop. 4027 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() { 4028 enum layout { 4029 rbp_off, 4030 rbpH_off, 4031 return_off, 4032 return_off2, 4033 framesize // inclusive of return address 4034 }; 4035 4036 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id); 4037 CodeBuffer code(name, 1024, 64); 4038 MacroAssembler* masm = new MacroAssembler(&code); 4039 address start = __ pc(); 4040 4041 __ enter(); 4042 address the_pc = __ pc(); 4043 4044 int frame_complete = the_pc - start; 4045 4046 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1); 4047 __ movptr(c_rarg0, r15_thread); 4048 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 4049 __ reset_last_Java_frame(true); 4050 4051 // rax is jobject handle result, unpack and process it through a barrier. 4052 __ resolve_global_jobject(rax, r15_thread, c_rarg0); 4053 4054 __ leave(); 4055 __ ret(0); 4056 4057 OopMapSet* oop_maps = new OopMapSet(); 4058 OopMap* map = new OopMap(framesize, 1); 4059 oop_maps->add_gc_map(frame_complete, map); 4060 4061 RuntimeStub* stub = 4062 RuntimeStub::new_runtime_stub(name, 4063 &code, 4064 frame_complete, 4065 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4066 oop_maps, 4067 false); 4068 return stub; 4069 } 4070 4071 // For c2: call to return a leased buffer. 
4072 RuntimeStub* SharedRuntime::generate_jfr_return_lease() { 4073 enum layout { 4074 rbp_off, 4075 rbpH_off, 4076 return_off, 4077 return_off2, 4078 framesize // inclusive of return address 4079 }; 4080 4081 const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id); 4082 CodeBuffer code(name, 1024, 64); 4083 MacroAssembler* masm = new MacroAssembler(&code); 4084 address start = __ pc(); 4085 4086 __ enter(); 4087 address the_pc = __ pc(); 4088 4089 int frame_complete = the_pc - start; 4090 4091 __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2); 4092 __ movptr(c_rarg0, r15_thread); 4093 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 4094 __ reset_last_Java_frame(true); 4095 4096 __ leave(); 4097 __ ret(0); 4098 4099 OopMapSet* oop_maps = new OopMapSet(); 4100 OopMap* map = new OopMap(framesize, 1); 4101 oop_maps->add_gc_map(frame_complete, map); 4102 4103 RuntimeStub* stub = 4104 RuntimeStub::new_runtime_stub(name, 4105 &code, 4106 frame_complete, 4107 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4108 oop_maps, 4109 false); 4110 return stub; 4111 } 4112 4113 #endif // INCLUDE_JFR