1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/nativeInst.hpp"
  33 #include "code/vtableStubs.hpp"
  34 #include "compiler/oopMap.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "gc/shared/gcLocker.hpp"
  37 #include "gc/shared/barrierSet.hpp"
  38 #include "gc/shared/barrierSetAssembler.hpp"
  39 #include "interpreter/interpreter.hpp"
  40 #include "logging/log.hpp"
  41 #include "memory/resourceArea.hpp"
  42 #include "memory/universe.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/method.inline.hpp"
  45 #include "prims/methodHandles.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/globals.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/timerTrace.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 #ifdef PRODUCT
  74 #define BLOCK_COMMENT(str) /* nothing */
  75 #else
  76 #define BLOCK_COMMENT(str) __ block_comment(str)
  77 #endif // PRODUCT
  78 
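// With 16-byte stack alignment and 4-byte VMReg stack slots this works out to 4 slots on x86_64.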
  79 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  80 
  81 class RegisterSaver {
  82   // Capture info about frame layout.  Layout offsets are in jint
  83   // units because compiler frame slots are jints.
  84 #define XSAVE_AREA_BEGIN 160
  85 #define XSAVE_AREA_YMM_BEGIN 576
  86 #define XSAVE_AREA_EGPRS 960
  87 #define XSAVE_AREA_OPMASK_BEGIN 1088
  88 #define XSAVE_AREA_ZMM_BEGIN 1152
  89 #define XSAVE_AREA_UPPERBANK 1664
  90 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  91 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  92 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  93 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  94 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
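// The XSAVE_AREA_* values above are byte offsets of the individual state components
// within the FPU/vector save image written by push_FPU_state(); the enum below
// converts them into jint-sized frame slots relative to fpu_state_off.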
  95   enum layout {
  96     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  97     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  98     DEF_XMM_OFFS(0),
  99     DEF_XMM_OFFS(1),
 100     // 2..15 are implied in range usage
 101     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 102     DEF_YMM_OFFS(0),
 103     DEF_YMM_OFFS(1),
 104     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 105     r16H_off,
 106     r17_off, r17H_off,
 107     r18_off, r18H_off,
 108     r19_off, r19H_off,
 109     r20_off, r20H_off,
 110     r21_off, r21H_off,
 111     r22_off, r22H_off,
 112     r23_off, r23H_off,
 113     r24_off, r24H_off,
 114     r25_off, r25H_off,
 115     r26_off, r26H_off,
 116     r27_off, r27H_off,
 117     r28_off, r28H_off,
 118     r29_off, r29H_off,
 119     r30_off, r30H_off,
 120     r31_off, r31H_off,
 121     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_OPMASK_OFFS(0),
 123     DEF_OPMASK_OFFS(1),
 124     // 2..7 are implied in range usage
 125     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_OFFS(0),
 127     DEF_ZMM_OFFS(1),
 128     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 129     DEF_ZMM_UPPER_OFFS(16),
 130     DEF_ZMM_UPPER_OFFS(17),
 131     // 18..31 are implied in range usage
 132     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 133     fpu_stateH_end,
 134     r15_off, r15H_off,
 135     r14_off, r14H_off,
 136     r13_off, r13H_off,
 137     r12_off, r12H_off,
 138     r11_off, r11H_off,
 139     r10_off, r10H_off,
 140     r9_off,  r9H_off,
 141     r8_off,  r8H_off,
 142     rdi_off, rdiH_off,
 143     rsi_off, rsiH_off,
 144     ignore_off, ignoreH_off,  // extra copy of rbp
 145     rsp_off, rspH_off,
 146     rbx_off, rbxH_off,
 147     rdx_off, rdxH_off,
 148     rcx_off, rcxH_off,
 149     rax_off, raxH_off,
 150     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 151     align_off, alignH_off,
 152     flags_off, flagsH_off,
 153     // The frame sender code expects that rbp will be in the "natural" place and
 154     // will override any oopMap setting for it. We must therefore force the layout
 155     // so that it agrees with the frame sender code.
 156     rbp_off, rbpH_off,        // copy of rbp we will restore
 157     return_off, returnH_off,  // slot for return address
 158     reg_save_size             // size in compiler stack slots
 159   };
 160 
 161  public:
 162   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 163   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 164 
 165   // Offsets into the register save area
 166   // Used by deoptimization when it is managing result register
 167   // values on its own
 168 
 169   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 170   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 171   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 172   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 173   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 174   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 175 
 176   // During deoptimization only the result registers need to be restored,
 177   // all the other values have already been extracted.
 178   static void restore_result_registers(MacroAssembler* masm);
 179 };
 180 
 181 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 182   int off = 0;
 183   int num_xmm_regs = XMMRegister::available_xmm_registers();
 184 #if COMPILER2_OR_JVMCI
 185   if (save_wide_vectors && UseAVX == 0) {
 186     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 187   }
 188   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 189 #else
 190   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 191 #endif
 192 
 193   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are allocated this way
 194   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 195   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 196   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 197   // CodeBlob frame size is in words.
 198   int frame_size_in_words = frame_size_in_bytes / wordSize;
 199   *total_frame_words = frame_size_in_words;
 200 
 201   // Save registers, fpu state, and flags.
 202   // We assume caller has already pushed the return address onto the
 203   // stack, so rsp is 8-byte aligned here.
 204   // We push rbp twice in this sequence because we want the real rbp
 205   // to be under the return address, just as a normal enter would place it.
 206 
 207   __ enter();          // rsp becomes 16-byte aligned here
 208   __ pushf();
 209   // Make sure rsp stays 16-byte aligned
 210   __ subq(rsp, 8);
 211   // Push CPU state in multiple of 16 bytes
 212   __ save_legacy_gprs();
 213   __ push_FPU_state();
 214 
 215 
 216   // push cpu state handles this on EVEX enabled targets
 217   if (save_wide_vectors) {
 218     // Save upper half of YMM registers(0..15)
 219     int base_addr = XSAVE_AREA_YMM_BEGIN;
 220     for (int n = 0; n < 16; n++) {
 221       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 222     }
 223     if (VM_Version::supports_evex()) {
 224       // Save upper half of ZMM registers(0..15)
 225       base_addr = XSAVE_AREA_ZMM_BEGIN;
 226       for (int n = 0; n < 16; n++) {
 227         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 228       }
 229       // Save full ZMM registers(16..num_xmm_regs)
 230       base_addr = XSAVE_AREA_UPPERBANK;
 231       off = 0;
 232       int vector_len = Assembler::AVX_512bit;
 233       for (int n = 16; n < num_xmm_regs; n++) {
 234         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 235       }
 236 #if COMPILER2_OR_JVMCI
 237       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 238       off = 0;
 239       for(int n = 0; n < KRegister::number_of_registers; n++) {
 240         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 241       }
 242 #endif
 243     }
 244   } else {
 245     if (VM_Version::supports_evex()) {
 246       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 247       int base_addr = XSAVE_AREA_UPPERBANK;
 248       off = 0;
 249       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 250       for (int n = 16; n < num_xmm_regs; n++) {
 251         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 252       }
 253 #if COMPILER2_OR_JVMCI
 254       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 255       off = 0;
 256       for(int n = 0; n < KRegister::number_of_registers; n++) {
 257         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 258       }
 259 #endif
 260     }
 261   }
 262 
 263 #if COMPILER2_OR_JVMCI
 264   if (UseAPX) {
 265       int base_addr = XSAVE_AREA_EGPRS;
 266       off = 0;
 267       for (int n = 16; n < Register::number_of_registers; n++) {
 268         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 269       }
 270   }
 271 #endif
 272 
 273   __ vzeroupper();
 274   if (frame::arg_reg_save_area_bytes != 0) {
 275     // Allocate argument register save area
 276     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 277   }
 278 
 279   // Set an oopmap for the call site.  This oopmap will map all
 280   // oop-registers and debug-info registers as callee-saved.  This
 281   // will allow deoptimization at this safepoint to find all possible
 282   // debug-info recordings, as well as let GC find all oops.
 283 
 284   OopMapSet *oop_maps = new OopMapSet();
 285   OopMap* map = new OopMap(frame_size_in_slots, 0);
 286 
 287 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 288 
 289   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 290   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 293   // rbp location is known implicitly by the frame sender code, needs no oopmap
 294   // and the location where rbp was saved is ignored
 295   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 296   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 305 
 306   if (UseAPX) {
 307     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 308     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 323   }
 324   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 325   // on EVEX enabled targets it is included in the xsave area.
 326   off = xmm0_off;
 327   int delta = xmm1_off - off;
 328   for (int n = 0; n < 16; n++) {
 329     XMMRegister xmm_name = as_XMMRegister(n);
 330     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 331     off += delta;
 332   }
 333   if (UseAVX > 2) {
 334     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 335     off = zmm16_off;
 336     delta = zmm17_off - off;
 337     for (int n = 16; n < num_xmm_regs; n++) {
 338       XMMRegister zmm_name = as_XMMRegister(n);
 339       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 340       off += delta;
 341     }
 342   }
 343 
 344 #if COMPILER2_OR_JVMCI
 345   if (save_wide_vectors) {
 346     // Save upper half of YMM registers(0..15)
 347     off = ymm0_off;
 348     delta = ymm1_off - ymm0_off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister ymm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 352       off += delta;
 353     }
 354     if (VM_Version::supports_evex()) {
 355       // Save upper half of ZMM registers(0..15)
 356       off = zmm0_off;
 357       delta = zmm1_off - zmm0_off;
 358       for (int n = 0; n < 16; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 361         off += delta;
 362       }
 363     }
 364   }
 365 #endif // COMPILER2_OR_JVMCI
 366 
 367   // %%% These should all be a waste but we'll keep things as they were for now
 368   if (true) {
 369     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 370     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 373     // rbp location is known implicitly by the frame sender code, needs no oopmap
 374     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 375     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 384     if (UseAPX) {
 385       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 386       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 401     }
 402     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 403     // on EVEX enabled targets it is included in the xsave area.
 404     off = xmm0H_off;
 405     delta = xmm1H_off - off;
 406     for (int n = 0; n < 16; n++) {
 407       XMMRegister xmm_name = as_XMMRegister(n);
 408       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 409       off += delta;
 410     }
 411     if (UseAVX > 2) {
 412       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 413       off = zmm16H_off;
 414       delta = zmm17H_off - off;
 415       for (int n = 16; n < num_xmm_regs; n++) {
 416         XMMRegister zmm_name = as_XMMRegister(n);
 417         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 418         off += delta;
 419       }
 420     }
 421   }
 422 
 423   return map;
 424 }
 425 
 426 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 427   int num_xmm_regs = XMMRegister::available_xmm_registers();
 428   if (frame::arg_reg_save_area_bytes != 0) {
 429     // Pop arg register save area
 430     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 431   }
 432 
 433 #if COMPILER2_OR_JVMCI
 434   if (restore_wide_vectors) {
 435     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 436     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 437   }
 438 #else
 439   assert(!restore_wide_vectors, "vectors are generated only by C2");
 440 #endif
 441 
 442   __ vzeroupper();
 443 
 444   // On EVEX enabled targets everything is handled in pop fpu state
 445   if (restore_wide_vectors) {
 446     // Restore upper half of YMM registers (0..15)
 447     int base_addr = XSAVE_AREA_YMM_BEGIN;
 448     for (int n = 0; n < 16; n++) {
 449       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 450     }
 451     if (VM_Version::supports_evex()) {
 452       // Restore upper half of ZMM registers (0..15)
 453       base_addr = XSAVE_AREA_ZMM_BEGIN;
 454       for (int n = 0; n < 16; n++) {
 455         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 456       }
 457       // Restore full ZMM registers(16..num_xmm_regs)
 458       base_addr = XSAVE_AREA_UPPERBANK;
 459       int vector_len = Assembler::AVX_512bit;
 460       int off = 0;
 461       for (int n = 16; n < num_xmm_regs; n++) {
 462         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 463       }
 464 #if COMPILER2_OR_JVMCI
 465       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 466       off = 0;
 467       for (int n = 0; n < KRegister::number_of_registers; n++) {
 468         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 469       }
 470 #endif
 471     }
 472   } else {
 473     if (VM_Version::supports_evex()) {
 474       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 475       int base_addr = XSAVE_AREA_UPPERBANK;
 476       int off = 0;
 477       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 478       for (int n = 16; n < num_xmm_regs; n++) {
 479         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 480       }
 481 #if COMPILER2_OR_JVMCI
 482       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 483       off = 0;
 484       for (int n = 0; n < KRegister::number_of_registers; n++) {
 485         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 486       }
 487 #endif
 488     }
 489   }
 490 
 491 #if COMPILER2_OR_JVMCI
 492   if (UseAPX) {
 493     int base_addr = XSAVE_AREA_EGPRS;
 494     int off = 0;
 495     for (int n = 16; n < Register::number_of_registers; n++) {
 496       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 497     }
 498   }
 499 #endif
 500 
 501   // Recover CPU state
 502   __ pop_FPU_state();
 503   __ restore_legacy_gprs();
 504   __ addq(rsp, 8);
 505   __ popf();
 506   // Get the rbp described implicitly by the calling convention (no oopMap)
 507   __ pop(rbp);
 508 }
 509 
 510 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 511 
 512   // Just restore result register. Only used by deoptimization. By
 513   // now any callee save register that needs to be restored to a c2
 514   // caller of the deoptee has been extracted into the vframeArray
 515   // and will be stuffed into the c2i adapter we create for later
 516   // restoration so only result registers need to be restored here.
 517 
 518   // Restore fp result register
 519   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 520   // Restore integer result register
 521   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 522   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 523 
 524   // Pop all of the register save area off the stack except the return address
 525   __ addptr(rsp, return_offset_in_bytes());
 526 }
 527 
 528 // Is the vector's size (in bytes) bigger than the size saved by default?
 529 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
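// Wider (YMM/ZMM) contents are saved only when RegisterSaver::save_live_registers()
// is called with save_wide_vectors == true.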
 530 bool SharedRuntime::is_wide_vector(int size) {
 531   return size > 16;
 532 }
 533 
 534 // ---------------------------------------------------------------------------
 535 // Read the array of BasicTypes from a signature, and compute where the
 536 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 537 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 538 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 539 // as framesizes are fixed.
 540 // VMRegImpl::stack0 refers to the first slot 0(sp),
 541 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 542 // Registers up to Register::number_of_registers are the 64-bit
 543 // integer registers.
 544 
 545 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 546 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 547 // units regardless of build. Of course, for i486 there is no 64-bit build.
 548 
 549 // The Java calling convention is a "shifted" version of the C ABI.
 550 // By skipping the first C ABI register we can call non-static JNI methods
 551 // with small numbers of arguments without having to shuffle the arguments
 552 // at all. Since we control the Java ABI we ought to at least get some
 553 // advantage out of it.
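// For illustration: a virtual method taking (long, int, Object, double) arrives with
// sig_bt == { T_OBJECT (receiver), T_LONG, T_VOID, T_INT, T_OBJECT, T_DOUBLE, T_VOID }
// and is mapped below to j_rarg0, j_rarg1, -, j_rarg2, j_rarg3, j_farg0, -, using no
// stack slots.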
 554 
 555 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 556                                            VMRegPair *regs,
 557                                            int total_args_passed) {
 558 
 559   // Create the mapping between argument positions and
 560   // registers.
 561   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 562     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 563   };
 564   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 565     j_farg0, j_farg1, j_farg2, j_farg3,
 566     j_farg4, j_farg5, j_farg6, j_farg7
 567   };
 568 
 569 
 570   uint int_args = 0;
 571   uint fp_args = 0;
 572   uint stk_args = 0;
 573 
 574   for (int i = 0; i < total_args_passed; i++) {
 575     switch (sig_bt[i]) {
 576     case T_BOOLEAN:
 577     case T_CHAR:
 578     case T_BYTE:
 579     case T_SHORT:
 580     case T_INT:
 581       if (int_args < Argument::n_int_register_parameters_j) {
 582         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 583       } else {
 584         stk_args = align_up(stk_args, 2);
 585         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 586         stk_args += 1;
 587       }
 588       break;
 589     case T_VOID:
 590       // halves of T_LONG or T_DOUBLE
 591       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 592       regs[i].set_bad();
 593       break;
 594     case T_LONG:
 595       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 596       // fall through
 597     case T_OBJECT:
 598     case T_ARRAY:
 599     case T_ADDRESS:
 600       if (int_args < Argument::n_int_register_parameters_j) {
 601         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 602       } else {
 603         stk_args = align_up(stk_args, 2);
 604         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 605         stk_args += 2;
 606       }
 607       break;
 608     case T_FLOAT:
 609       if (fp_args < Argument::n_float_register_parameters_j) {
 610         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 611       } else {
 612         stk_args = align_up(stk_args, 2);
 613         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 614         stk_args += 1;
 615       }
 616       break;
 617     case T_DOUBLE:
 618       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 621       } else {
 622         stk_args = align_up(stk_args, 2);
 623         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 624         stk_args += 2;
 625       }
 626       break;
 627     default:
 628       ShouldNotReachHere();
 629       break;
 630     }
 631   }
 632 
 633   return stk_args;
 634 }
 635 
 636 // Patch the caller's callsite with the entry to compiled code if it exists.
 637 static void patch_callers_callsite(MacroAssembler *masm) {
 638   Label L;
 639   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 640   __ jcc(Assembler::equal, L);
 641 
 642   // Save the current stack pointer
 643   __ mov(r13, rsp);
 644   // Schedule the branch target address early.
 645   // Call into the VM to patch the caller, then jump to compiled callee
 646   // rax isn't live so capture return address while we easily can
 647   __ movptr(rax, Address(rsp, 0));
 648 
 649   // align stack so push_CPU_state doesn't fault
 650   __ andptr(rsp, -(StackAlignmentInBytes));
 651   __ push_CPU_state();
 652   __ vzeroupper();
 653   // VM needs caller's callsite
 654   // VM needs target method
 655   // This needs to be a long call since we will relocate this adapter to
 656   // the codeBuffer and it may not reach
 657 
 658   // Allocate argument register save area
 659   if (frame::arg_reg_save_area_bytes != 0) {
 660     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 661   }
 662   __ mov(c_rarg0, rbx);
 663   __ mov(c_rarg1, rax);
 664   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 665 
 666   // De-allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670 
 671   __ vzeroupper();
 672   __ pop_CPU_state();
 673   // restore sp
 674   __ mov(rsp, r13);
 675   __ bind(L);
 676 }
 677 
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
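  // For example, five incoming arguments at an 8-byte stackElementSize give
  // 5 * 8 = 40 bytes, rounded up here to 48 to preserve 16-byte alignment.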
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
 749     // However, to make things extra confusing: because we can fit a long/double in
 750     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 751     // leaves one slot empty and only stores to a single slot. In this case the
 752     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
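    // With the 4-arg example above, the T_LONG at i == 0 is therefore written to
    // next_off (24), i.e. into the T_VOID slot, while its own st_off (32) slot is
    // left unused (or filled with known junk under ASSERT).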
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
 761       // memory to memory use rax
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
 772         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 773         // T_DOUBLE and T_LONG use two slots in the interpreter
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
 790         // must be only an int (or less) so move only 32 bits to the slot
 791         // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
 794         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 795         // T_DOUBLE and T_LONG use two slots in the interpreter
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
 811         // only a float; use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
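// Branches to L_ok if pc_reg lies strictly inside (code_start, code_end);
// otherwise it falls through (L_fail) so the caller can test another range or stop.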
 829 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 830                         address code_start, address code_end,
 831                         Label& L_ok) {
 832   Label L_fail;
 833   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
 834   __ cmpptr(pc_reg, temp_reg);
 835   __ jcc(Assembler::belowEqual, L_fail);
 836   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
 837   __ cmpptr(pc_reg, temp_reg);
 838   __ jcc(Assembler::below, L_ok);
 839   __ bind(L_fail);
 840 }
 841 
 842 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 843                                     int total_args_passed,
 844                                     int comp_args_on_stack,
 845                                     const BasicType *sig_bt,
 846                                     const VMRegPair *regs) {
 847 
 848   // Note: r13 contains the senderSP on entry. We must preserve it since
 849   // we may do an i2c -> c2i transition if we lose a race where compiled
 850   // code goes non-entrant while we get args ready.
 851   // In addition we use r13 to locate all the interpreter args because
 852   // we must align the stack to 16 bytes on an i2c entry, or else we lose
 853   // the alignment we expect in all compiled code, and the register save
 854   // code can segv when fxsave instructions find an improperly aligned
 855   // stack pointer.
 856 
 857   // Adapters can be frameless because they do not require the caller
 858   // to perform additional cleanup work, such as correcting the stack pointer.
 859   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 860   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 861   // even if a callee has modified the stack pointer.
 862   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 863   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 864   // up via the senderSP register).
 865   // In other words, if *either* the caller or callee is interpreted, we can
 866   // get the stack pointer repaired after a call.
 867   // This is why c2i and i2c adapters cannot be indefinitely composed.
 868   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 869   // both caller and callee would be compiled methods, and neither would
 870   // clean up the stack pointer changes performed by the two adapters.
 871   // If this happens, control eventually transfers back to the compiled
 872   // caller, but with an uncorrected stack, causing delayed havoc.
 873 
 874   if (VerifyAdapterCalls &&
 875       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 876     // So, let's test for cascading c2i/i2c adapters right now.
 877     //  assert(Interpreter::contains($return_addr) ||
 878     //         StubRoutines::contains($return_addr),
 879     //         "i2c adapter must return to an interpreter frame");
 880     __ block_comment("verify_i2c { ");
 881     // Pick up the return address
 882     __ movptr(rax, Address(rsp, 0));
 883     Label L_ok;
 884     if (Interpreter::code() != nullptr) {
 885       range_check(masm, rax, r11,
 886                   Interpreter::code()->code_start(),
 887                   Interpreter::code()->code_end(),
 888                   L_ok);
 889     }
 890     if (StubRoutines::initial_stubs_code() != nullptr) {
 891       range_check(masm, rax, r11,
 892                   StubRoutines::initial_stubs_code()->code_begin(),
 893                   StubRoutines::initial_stubs_code()->code_end(),
 894                   L_ok);
 895     }
 896     if (StubRoutines::final_stubs_code() != nullptr) {
 897       range_check(masm, rax, r11,
 898                   StubRoutines::final_stubs_code()->code_begin(),
 899                   StubRoutines::final_stubs_code()->code_end(),
 900                   L_ok);
 901     }
 902     const char* msg = "i2c adapter must return to an interpreter frame";
 903     __ block_comment(msg);
 904     __ stop(msg);
 905     __ bind(L_ok);
 906     __ block_comment("} verify_i2c ");
 907   }
 908 
 909   // Must preserve original SP for loading incoming arguments because
 910   // we need to align the outgoing SP for compiled code.
 911   __ movptr(r11, rsp);
 912 
 913   // Pick up the return address
 914   __ pop(rax);
 915 
 916   // Convert 4-byte c2 stack slots to words.
 917   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
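  // e.g. comp_args_on_stack == 3 gives align_up(3 * 4, 8) >> 3 == 2 words.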
 918 
 919   if (comp_args_on_stack) {
 920     __ subptr(rsp, comp_words_on_stack * wordSize);
 921   }
 922 
 923   // Ensure compiled code always sees stack at proper alignment
 924   __ andptr(rsp, -16);
 925 
 926   // push the return address; this misaligns the stack exactly the way the youngest
 927   // frame always sees it after the placement of a call instruction
 928   __ push(rax);
 929 
 930   // Put saved SP in another register
 931   const Register saved_sp = rax;
 932   __ movptr(saved_sp, r11);
 933 
 934   // Will jump to the compiled code just as if compiled code was doing it.
 935   // Pre-load the register-jump target early, to schedule it better.
 936   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 937 
 938 #if INCLUDE_JVMCI
 939   if (EnableJVMCI) {
 940     // check if this call should be routed towards a specific entry point
 941     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 942     Label no_alternative_target;
 943     __ jcc(Assembler::equal, no_alternative_target);
 944     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 945     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 946     __ bind(no_alternative_target);
 947   }
 948 #endif // INCLUDE_JVMCI
 949 
 950   // Now generate the shuffle code.  Pick up all register args and move the
 951   // rest through the floating point stack top.
 952   for (int i = 0; i < total_args_passed; i++) {
 953     if (sig_bt[i] == T_VOID) {
 954       // Longs and doubles are passed in native word order, but misaligned
 955       // in the 32-bit build.
 956       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 957       continue;
 958     }
 959 
 960     // Pick up 0, 1 or 2 words from SP+offset.
 961 
 962     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 963             "scrambled load targets?");
 964     // Load in argument order going down.
 965     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 966     // Point to interpreter value (vs. tag)
 967     int next_off = ld_off - Interpreter::stackElementSize;
 968     //
 969     //
 970     //
 971     VMReg r_1 = regs[i].first();
 972     VMReg r_2 = regs[i].second();
 973     if (!r_1->is_valid()) {
 974       assert(!r_2->is_valid(), "");
 975       continue;
 976     }
 977     if (r_1->is_stack()) {
 978       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 979       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 980 
 981       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 982       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 983       // will be generated.
 984       if (!r_2->is_valid()) {
 985         // sign extend???
 986         __ movl(r13, Address(saved_sp, ld_off));
 987         __ movptr(Address(rsp, st_off), r13);
 988       } else {
 989         //
 990         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 991         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 992         // so we must adjust where to pick up the data to match the interpreter.
 993         //
 994         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 995         // are accessed at negative offsets, so the LSW is at the lower address
 996 
 997         // ld_off is MSW so get LSW
 998         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 999                            next_off : ld_off;
1000         __ movq(r13, Address(saved_sp, offset));
1001         // st_off is LSW (i.e. reg.first())
1002         __ movq(Address(rsp, st_off), r13);
1003       }
1004     } else if (r_1->is_Register()) {  // Register argument
1005       Register r = r_1->as_Register();
1006       assert(r != rax, "must be different");
1007       if (r_2->is_valid()) {
1008         //
1009         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
1010         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1011         // so we must adjust where to pick up the data to match the interpreter.
1012 
1013         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1014                            next_off : ld_off;
1015 
1016         // this can be a misaligned move
1017         __ movq(r, Address(saved_sp, offset));
1018       } else {
1019         // sign extend and use a full word?
1020         __ movl(r, Address(saved_sp, ld_off));
1021       }
1022     } else {
1023       if (!r_2->is_valid()) {
1024         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1025       } else {
1026         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1027       }
1028     }
1029   }
1030 
1031   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1032 
1033   // 6243940 We might end up in handle_wrong_method if
1034   // the callee is deoptimized as we race through here. If that
1035   // happens we don't want to take a safepoint because the
1036   // caller frame will look interpreted and arguments are now
1037   // "compiled" so it is much better to make this transition
1038   // invisible to the stack walking code. Unfortunately if
1039   // we try and find the callee by normal means a safepoint
1040   // is possible. So we stash the desired callee in the thread
1041   // and the vm will find it there should this case occur.
1042 
1043   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1044 
1045   // put Method* where a c2i would expect should we end up there
1046   // only needed because the c2 resolve stubs return Method* as a result in
1047   // rax
1048   __ mov(rax, rbx);
1049   __ jmp(r11);
1050 }
1051 
1052 // ---------------------------------------------------------------
1053 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1054                                                             int total_args_passed,
1055                                                             int comp_args_on_stack,
1056                                                             const BasicType *sig_bt,
1057                                                             const VMRegPair *regs,
1058                                                             AdapterFingerPrint* fingerprint) {
1059   address i2c_entry = __ pc();
1060 
1061   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1062 
1063   // -------------------------------------------------------------------------
1064   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1065   // to the interpreter.  The args start out packed in the compiled layout.  They
1066   // need to be unpacked into the interpreter layout.  This will almost always
1067   // require some stack space.  We grow the current (compiled) stack, then repack
1068   // the args.  We  finally end in a jump to the generic interpreter entry point.
1069   // On exit from the interpreter, the interpreter will restore our SP (lest the
1070   // compiled code, which relies solely on SP and not RBP, get sick).
1071 
1072   address c2i_unverified_entry = __ pc();
1073   Label skip_fixup;
1074 
1075   Register data = rax;
1076   Register receiver = j_rarg0;
1077   Register temp = rbx;
1078 
1079   {
1080     __ ic_check(1 /* end_alignment */);
1081     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1082     // Method might have been compiled since the call site was patched to
1083     // interpreted; if that is the case, treat it as a miss so we can get
1084     // the call site corrected.
1085     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1086     __ jcc(Assembler::equal, skip_fixup);
1087     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1088   }
1089 
1090   address c2i_entry = __ pc();
1091 
1092   // Class initialization barrier for static methods
1093   address c2i_no_clinit_check_entry = nullptr;
1094   if (VM_Version::supports_fast_class_init_checks()) {
1095     Label L_skip_barrier;
1096     Register method = rbx;
1097 
1098     { // Bypass the barrier for non-static methods
1099       Register flags = rscratch1;
1100       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1101       __ testl(flags, JVM_ACC_STATIC);
1102       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1103     }
1104 
1105     Register klass = rscratch1;
1106     __ load_method_holder(klass, method);
1107     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1108 
1109     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1110 
1111     __ bind(L_skip_barrier);
1112     c2i_no_clinit_check_entry = __ pc();
1113   }
1114 
1115   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1116   bs->c2i_entry_barrier(masm);
1117 
1118   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1119 
1120   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1121 }
1122 
1123 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1124                                          VMRegPair *regs,
1125                                          int total_args_passed) {
1126 
1127 // We return the number of VMRegImpl stack slots we need to reserve for all
1128 // the arguments NOT counting out_preserve_stack_slots.
1129 
1130 // NOTE: These arrays will have to change when c1 is ported
1131 #ifdef _WIN64
1132     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1133       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1134     };
1135     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1136       c_farg0, c_farg1, c_farg2, c_farg3
1137     };
1138 #else
1139     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1140       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1141     };
1142     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1143       c_farg0, c_farg1, c_farg2, c_farg3,
1144       c_farg4, c_farg5, c_farg6, c_farg7
1145     };
1146 #endif // _WIN64
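    // Note: on Win64 the integer and FP argument registers share the same four
    // positional slots, which is why assigning an integer register below also bumps
    // fp_args (and vice versa), and why shadow-space stack slots are reserved even
    // for register arguments.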
1147 
1148 
1149     uint int_args = 0;
1150     uint fp_args = 0;
1151     uint stk_args = 0; // inc by 2 each time
1152 
1153     for (int i = 0; i < total_args_passed; i++) {
1154       switch (sig_bt[i]) {
1155       case T_BOOLEAN:
1156       case T_CHAR:
1157       case T_BYTE:
1158       case T_SHORT:
1159       case T_INT:
1160         if (int_args < Argument::n_int_register_parameters_c) {
1161           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1162 #ifdef _WIN64
1163           fp_args++;
1164           // Allocate slots for the callee to stuff register args on the stack.
1165           stk_args += 2;
1166 #endif
1167         } else {
1168           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1169           stk_args += 2;
1170         }
1171         break;
1172       case T_LONG:
1173         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1174         // fall through
1175       case T_OBJECT:
1176       case T_ARRAY:
1177       case T_ADDRESS:
1178       case T_METADATA:
1179         if (int_args < Argument::n_int_register_parameters_c) {
1180           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1181 #ifdef _WIN64
1182           fp_args++;
1183           stk_args += 2;
1184 #endif
1185         } else {
1186           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1187           stk_args += 2;
1188         }
1189         break;
1190       case T_FLOAT:
1191         if (fp_args < Argument::n_float_register_parameters_c) {
1192           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1193 #ifdef _WIN64
1194           int_args++;
1195           // Allocate slots for the callee to stuff register args on the stack.
1196           stk_args += 2;
1197 #endif
1198         } else {
1199           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1200           stk_args += 2;
1201         }
1202         break;
1203       case T_DOUBLE:
1204         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1205         if (fp_args < Argument::n_float_register_parameters_c) {
1206           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1207 #ifdef _WIN64
1208           int_args++;
1209           // Allocate slots for the callee to stuff register args on the stack.
1210           stk_args += 2;
1211 #endif
1212         } else {
1213           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1214           stk_args += 2;
1215         }
1216         break;
1217       case T_VOID: // Halves of longs and doubles
1218         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1219         regs[i].set_bad();
1220         break;
1221       default:
1222         ShouldNotReachHere();
1223         break;
1224       }
1225     }
1226 #ifdef _WIN64
1227   // The Windows ABI requires that we always allocate enough stack space
1228   // for 4 64-bit registers to be stored down.
1229   if (stk_args < 8) {
1230     stk_args = 8;
1231   }
1232 #endif // _WIN64
1233 
1234   return stk_args;
1235 }
1236 
1237 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1238                                              uint num_bits,
1239                                              uint total_args_passed) {
1240   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1241          "only certain vector sizes are supported for now");
1242 
1243   static const XMMRegister VEC_ArgReg[32] = {
1244      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1245      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1246     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1247     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1248   };
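  // Vector arguments are passed entirely in XMM/YMM/ZMM registers; no stack slots
  // are consumed, so the returned stk_args is always zero.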
1249 
1250   uint stk_args = 0;
1251   uint fp_args = 0;
1252 
1253   for (uint i = 0; i < total_args_passed; i++) {
1254     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1255     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1256     regs[i].set_pair(vmreg->next(next_val), vmreg);
1257   }
1258 
1259   return stk_args;
1260 }
1261 
1262 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1263   // We always ignore the frame_slots arg and just use the space just below the frame
1264   // pointer, which by this time is free to use.
1265   switch (ret_type) {
1266   case T_FLOAT:
1267     __ movflt(Address(rbp, -wordSize), xmm0);
1268     break;
1269   case T_DOUBLE:
1270     __ movdbl(Address(rbp, -wordSize), xmm0);
1271     break;
1272   case T_VOID:  break;
1273   default: {
1274     __ movptr(Address(rbp, -wordSize), rax);
1275     }
1276   }
1277 }
1278 
1279 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1280   // We always ignore the frame_slots arg and just use the space just below the frame
1281   // pointer, which by this time is free to use.
1282   switch (ret_type) {
1283   case T_FLOAT:
1284     __ movflt(xmm0, Address(rbp, -wordSize));
1285     break;
1286   case T_DOUBLE:
1287     __ movdbl(xmm0, Address(rbp, -wordSize));
1288     break;
1289   case T_VOID:  break;
1290   default: {
1291     __ movptr(rax, Address(rbp, -wordSize));
1292     }
1293   }
1294 }
1295 
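// save_args/restore_args preserve the argument registers around a call into the VM:
// GPR arguments are pushed/popped, while XMM arguments are spilled to (and reloaded
// from) a two-word stack slot.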
1296 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1297     for ( int i = first_arg ; i < arg_count ; i++ ) {
1298       if (args[i].first()->is_Register()) {
1299         __ push(args[i].first()->as_Register());
1300       } else if (args[i].first()->is_XMMRegister()) {
1301         __ subptr(rsp, 2*wordSize);
1302         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1303       }
1304     }
1305 }
1306 
1307 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1308     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1309       if (args[i].first()->is_Register()) {
1310         __ pop(args[i].first()->as_Register());
1311       } else if (args[i].first()->is_XMMRegister()) {
1312         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1313         __ addptr(rsp, 2*wordSize);
1314       }
1315     }
1316 }
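     // Note: save_args() and restore_args() above must be used as a matched,
     // properly nested pair: general-purpose arguments are pushed and XMM
     // arguments spilled into 16-byte slots in ascending order, and
     // restore_args() walks the same arguments in reverse to pop them back.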
1317 
1318 static void verify_oop_args(MacroAssembler* masm,
1319                             const methodHandle& method,
1320                             const BasicType* sig_bt,
1321                             const VMRegPair* regs) {
1322   Register temp_reg = rbx;  // not part of any compiled calling seq
1323   if (VerifyOops) {
1324     for (int i = 0; i < method->size_of_parameters(); i++) {
1325       if (is_reference_type(sig_bt[i])) {
1326         VMReg r = regs[i].first();
1327         assert(r->is_valid(), "bad oop arg");
1328         if (r->is_stack()) {
1329           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1330           __ verify_oop(temp_reg);
1331         } else {
1332           __ verify_oop(r->as_Register());
1333         }
1334       }
1335     }
1336   }
1337 }
1338 
1339 static void check_continuation_enter_argument(VMReg actual_vmreg,
1340                                               Register expected_reg,
1341                                               const char* name) {
1342   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1343   assert(actual_vmreg->as_Register() == expected_reg,
1344          "%s is in unexpected register: %s instead of %s",
1345          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1346 }
1347 
1348 
1349 //---------------------------- continuation_enter_setup ---------------------------
1350 //
1351 // Arguments:
1352 //   None.
1353 //
1354 // Results:
1355 //   rsp: pointer to blank ContinuationEntry
1356 //
1357 // Kills:
1358 //   rax
1359 //
1360 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1361   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1362   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1363   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1364 
1365   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1366   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1367 
1368   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1369   OopMap* map = new OopMap(frame_size, 0);
1370 
1371   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1372   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1373   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
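
       // Net effect (sketch): a blank ContinuationEntry has been pushed and
       // linked onto the thread's chain:
       //   entry->parent           == previous JavaThread::_cont_entry
       //   JavaThread::_cont_entry == rsp (the new entry)
       // fill_continuation_entry() populates the remaining fields.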
1374 
1375   return map;
1376 }
1377 
1378 //---------------------------- fill_continuation_entry ---------------------------
1379 //
1380 // Arguments:
1381 //   rsp: pointer to blank Continuation entry
1382 //   reg_cont_obj: pointer to the continuation
1383 //   reg_flags: flags
1384 //
1385 // Results:
1386 //   rsp: pointer to filled out ContinuationEntry
1387 //
1388 // Kills:
1389 //   rax
1390 //
1391 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1392   assert_different_registers(rax, reg_cont_obj, reg_flags);
1393 #ifdef ASSERT
1394   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1395 #endif
1396   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1397   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1398   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1399   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1400   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1401 
1402   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1403   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1404   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1405   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1406 
1407   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1408   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1409 }
1410 
1411 //---------------------------- continuation_enter_cleanup ---------------------------
1412 //
1413 // Arguments:
1414 //   rsp: pointer to the ContinuationEntry
1415 //
1416 // Results:
1417 //   rsp: pointer to the spilled rbp in the entry frame
1418 //
1419 // Kills:
1420 //   rbx
1421 //
1422 static void continuation_enter_cleanup(MacroAssembler* masm) {
1423 #ifdef ASSERT
1424   Label L_good_sp;
1425   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1426   __ jcc(Assembler::equal, L_good_sp);
1427   __ stop("Incorrect rsp at continuation_enter_cleanup");
1428   __ bind(L_good_sp);
1429 #endif
1430   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1431   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1432 
1433   if (CheckJNICalls) {
1434     // Check if this is a virtual thread continuation
1435     Label L_skip_vthread_code;
1436     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1437     __ jcc(Assembler::equal, L_skip_vthread_code);
1438 
1439     // If the held monitor count is > 0 and this vthread is terminating then
1440     // it failed to release a JNI monitor. So we issue the same log message
1441     // that JavaThread::exit does.
1442     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1443     __ jcc(Assembler::equal, L_skip_vthread_code);
1444 
1445     // rax may hold an exception oop, save it before the call
1446     __ push(rax);
1447     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1448     __ pop(rax);
1449 
1450     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1451     // on termination. The held count is implicitly zeroed below when we restore from
1452     // the parent held count (which has to be zero).
1453     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1454 
1455     __ bind(L_skip_vthread_code);
1456   }
1457 #ifdef ASSERT
1458   else {
1459     // Check if this is a virtual thread continuation
1460     Label L_skip_vthread_code;
1461     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1462     __ jcc(Assembler::equal, L_skip_vthread_code);
1463 
1464     // See the comment just above. If not checking JNI calls, the JNI count is only
1465     // needed for assertion checking.
1466     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1467 
1468     __ bind(L_skip_vthread_code);
1469   }
1470 #endif
1471 
1472   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1473   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1474 
1475   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1476   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1477   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1478 }
1479 
1480 static void gen_continuation_enter(MacroAssembler* masm,
1481                                    const VMRegPair* regs,
1482                                    int& exception_offset,
1483                                    OopMapSet* oop_maps,
1484                                    int& frame_complete,
1485                                    int& stack_slots,
1486                                    int& interpreted_entry_offset,
1487                                    int& compiled_entry_offset) {
1488 
1489   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1490   int pos_cont_obj   = 0;
1491   int pos_is_cont    = 1;
1492   int pos_is_virtual = 2;
1493 
1494   // The platform-specific calling convention may present the arguments in various registers.
1495   // To simplify the rest of the code, we expect the arguments to reside in these known
1496   // registers, and we additionally check the placement here in case the calling convention
1497   // ever changes.
1498   Register reg_cont_obj   = c_rarg1;
1499   Register reg_is_cont    = c_rarg2;
1500   Register reg_is_virtual = c_rarg3;
1501 
1502   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1503   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1504   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1505 
1506   // Utility methods kill rax; make sure there are no collisions
1507   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1508 
1509   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1510                          relocInfo::static_call_type);
1511 
1512   address start = __ pc();
1513 
1514   Label L_thaw, L_exit;
1515 
1516   // i2i entry used at interp_only_mode only
1517   interpreted_entry_offset = __ pc() - start;
1518   {
1519 #ifdef ASSERT
1520     Label is_interp_only;
1521     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1522     __ jcc(Assembler::notEqual, is_interp_only);
1523     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1524     __ bind(is_interp_only);
1525 #endif
1526 
1527     __ pop(rax); // return address
1528     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1529     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1530     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1531     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1532     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1533     __ push(rax); // return address
1534     __ push_cont_fastpath();
1535 
1536     __ enter();
1537 
1538     stack_slots = 2; // will be adjusted in setup
1539     OopMap* map = continuation_enter_setup(masm, stack_slots);
1540     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear
1541     // unsafe; that's okay because at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1542 
1543     __ verify_oop(reg_cont_obj);
1544 
1545     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1546 
1547     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1548     __ testptr(reg_is_cont, reg_is_cont);
1549     __ jcc(Assembler::notZero, L_thaw);
1550 
1551     // --- Resolve path
1552 
1553     // Make sure the call is patchable
1554     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1555     // Emit stub for static call
1556     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1557     if (stub == nullptr) {
1558       fatal("CodeCache is full at gen_continuation_enter");
1559     }
1560     __ call(resolve);
1561     oop_maps->add_gc_map(__ pc() - start, map);
1562     __ post_call_nop();
1563 
1564     __ jmp(L_exit);
1565   }
1566 
1567   // compiled entry
1568   __ align(CodeEntryAlignment);
1569   compiled_entry_offset = __ pc() - start;
1570   __ enter();
1571 
1572   stack_slots = 2; // will be adjusted in setup
1573   OopMap* map = continuation_enter_setup(masm, stack_slots);
1574 
1575   // Frame is now completed as far as size and linkage.
1576   frame_complete = __ pc() - start;
1577 
1578   __ verify_oop(reg_cont_obj);
1579 
1580   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1581 
1582   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1583   __ testptr(reg_is_cont, reg_is_cont);
1584   __ jccb(Assembler::notZero, L_thaw);
1585 
1586   // --- call Continuation.enter(Continuation c, boolean isContinue)
1587 
1588   // Make sure the call is patchable
1589   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1590 
1591   // Emit stub for static call
1592   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1593   if (stub == nullptr) {
1594     fatal("CodeCache is full at gen_continuation_enter");
1595   }
1596 
1597   // The call needs to be resolved. There's a special case for this in
1598   // SharedRuntime::find_callee_info_helper() which calls
1599   // LinkResolver::resolve_continuation_enter() which resolves the call to
1600   // Continuation.enter(Continuation c, boolean isContinue).
1601   __ call(resolve);
1602 
1603   oop_maps->add_gc_map(__ pc() - start, map);
1604   __ post_call_nop();
1605 
1606   __ jmpb(L_exit);
1607 
1608   // --- Thawing path
1609 
1610   __ bind(L_thaw);
1611 
1612   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1613   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1614 
1615   ContinuationEntry::_return_pc_offset = __ pc() - start;
1616   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1617   __ post_call_nop();
1618 
1619   // --- Normal exit (resolve/thawing)
1620 
1621   __ bind(L_exit);
1622   ContinuationEntry::_cleanup_offset = __ pc() - start;
1623   continuation_enter_cleanup(masm);
1624   __ pop(rbp);
1625   __ ret(0);
1626 
1627   // --- Exception handling path
1628 
1629   exception_offset = __ pc() - start;
1630 
1631   continuation_enter_cleanup(masm);
1632   __ pop(rbp);
1633 
1634   __ movptr(c_rarg0, r15_thread);
1635   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1636 
1637   // rax still holds the original exception oop, save it before the call
1638   __ push(rax);
1639 
1640   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1641   __ movptr(rbx, rax);
1642 
1643   // Continue at exception handler:
1644   //   rax: exception oop
1645   //   rbx: exception handler
1646   //   rdx: exception pc
1647   __ pop(rax);
1648   __ verify_oop(rax);
1649   __ pop(rdx);
1650   __ jmp(rbx);
1651 }
1652 
1653 static void gen_continuation_yield(MacroAssembler* masm,
1654                                    const VMRegPair* regs,
1655                                    OopMapSet* oop_maps,
1656                                    int& frame_complete,
1657                                    int& stack_slots,
1658                                    int& compiled_entry_offset) {
1659   enum layout {
1660     rbp_off,
1661     rbpH_off,
1662     return_off,
1663     return_off2,
1664     framesize // inclusive of return address
1665   };
1666   stack_slots = framesize / VMRegImpl::slots_per_word;
1667   assert(stack_slots == 2, "recheck layout");
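       // Layout sanity (comment only): framesize is 4 VMReg slots (saved rbp
       // and the return address, two 32-bit slots each), so stack_slots works
       // out to 4 / 2 == 2 words, matching the assert above.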
1668 
1669   address start = __ pc();
1670   compiled_entry_offset = __ pc() - start;
1671   __ enter();
1672   address the_pc = __ pc();
1673 
1674   frame_complete = the_pc - start;
1675 
1676   // This nop must be exactly at the PC we push into the frame info.
1677   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1678   // with it right away.
1679   __ post_call_nop();
1680   OopMap* map = new OopMap(framesize, 1);
1681   oop_maps->add_gc_map(frame_complete, map);
1682 
1683   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1684   __ movptr(c_rarg0, r15_thread);
1685   __ movptr(c_rarg1, rsp);
1686   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1687   __ reset_last_Java_frame(true);
1688 
1689   Label L_pinned;
1690 
1691   __ testptr(rax, rax);
1692   __ jcc(Assembler::notZero, L_pinned);
1693 
1694   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1695   continuation_enter_cleanup(masm);
1696   __ pop(rbp);
1697   __ ret(0);
1698 
1699   __ bind(L_pinned);
1700 
1701   // Pinned, return to caller
1702 
1703   // handle pending exception thrown by freeze
1704   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1705   Label ok;
1706   __ jcc(Assembler::equal, ok);
1707   __ leave();
1708   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1709   __ bind(ok);
1710 
1711   __ leave();
1712   __ ret(0);
1713 }
1714 
1715 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1716   ::continuation_enter_cleanup(masm);
1717 }
1718 
1719 static void gen_special_dispatch(MacroAssembler* masm,
1720                                  const methodHandle& method,
1721                                  const BasicType* sig_bt,
1722                                  const VMRegPair* regs) {
1723   verify_oop_args(masm, method, sig_bt, regs);
1724   vmIntrinsics::ID iid = method->intrinsic_id();
1725 
1726   // Now write the args into the outgoing interpreter space
1727   bool     has_receiver   = false;
1728   Register receiver_reg   = noreg;
1729   int      member_arg_pos = -1;
1730   Register member_reg     = noreg;
1731   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1732   if (ref_kind != 0) {
1733     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1734     member_reg = rbx;  // known to be free at this point
1735     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1736   } else if (iid == vmIntrinsics::_invokeBasic) {
1737     has_receiver = true;
1738   } else if (iid == vmIntrinsics::_linkToNative) {
1739     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1740     member_reg = rbx;  // known to be free at this point
1741   } else {
1742     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1743   }
1744 
1745   if (member_reg != noreg) {
1746     // Load the member_arg into register, if necessary.
1747     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1748     VMReg r = regs[member_arg_pos].first();
1749     if (r->is_stack()) {
1750       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1751     } else {
1752       // no data motion is needed
1753       member_reg = r->as_Register();
1754     }
1755   }
1756 
1757   if (has_receiver) {
1758     // Make sure the receiver is loaded into a register.
1759     assert(method->size_of_parameters() > 0, "oob");
1760     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1761     VMReg r = regs[0].first();
1762     assert(r->is_valid(), "bad receiver arg");
1763     if (r->is_stack()) {
1764       // Porting note:  This assumes that compiled calling conventions always
1765       // pass the receiver oop in a register.  If this is not true on some
1766       // platform, pick a temp and load the receiver from stack.
1767       fatal("receiver always in a register");
1768       receiver_reg = j_rarg0;  // known to be free at this point
1769       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1770     } else {
1771       // no data motion is needed
1772       receiver_reg = r->as_Register();
1773     }
1774   }
1775 
1776   // Figure out which address we are really jumping to:
1777   MethodHandles::generate_method_handle_dispatch(masm, iid,
1778                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1779 }
1780 
1781 // ---------------------------------------------------------------------------
1782 // Generate a native wrapper for a given method.  The method takes arguments
1783 // in the Java compiled code convention, marshals them to the native
1784 // convention (handlizes oops, etc.), transitions to native, makes the call,
1785 // returns to Java state (possibly blocking), unhandlizes any result and
1786 // returns.
1787 //
1788 // Critical native functions are a shorthand for the use of
1789 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1790 // functions.  The wrapper is expected to unpack the arguments before
1791 // passing them to the callee. Critical native functions leave the state _in_Java,
1792 // since they cannot stop for GC.
1793 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1794 // block and the check for pending exceptions, since it's impossible for them
1795 // to be thrown.
1796 //
1797 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1798                                                 const methodHandle& method,
1799                                                 int compile_id,
1800                                                 BasicType* in_sig_bt,
1801                                                 VMRegPair* in_regs,
1802                                                 BasicType ret_type) {
1803   if (method->is_continuation_native_intrinsic()) {
1804     int exception_offset = -1;
1805     OopMapSet* oop_maps = new OopMapSet();
1806     int frame_complete = -1;
1807     int stack_slots = -1;
1808     int interpreted_entry_offset = -1;
1809     int vep_offset = -1;
1810     if (method->is_continuation_enter_intrinsic()) {
1811       gen_continuation_enter(masm,
1812                              in_regs,
1813                              exception_offset,
1814                              oop_maps,
1815                              frame_complete,
1816                              stack_slots,
1817                              interpreted_entry_offset,
1818                              vep_offset);
1819     } else if (method->is_continuation_yield_intrinsic()) {
1820       gen_continuation_yield(masm,
1821                              in_regs,
1822                              oop_maps,
1823                              frame_complete,
1824                              stack_slots,
1825                              vep_offset);
1826     } else {
1827       guarantee(false, "Unknown Continuation native intrinsic");
1828     }
1829 
1830 #ifdef ASSERT
1831     if (method->is_continuation_enter_intrinsic()) {
1832       assert(interpreted_entry_offset != -1, "Must be set");
1833       assert(exception_offset != -1,         "Must be set");
1834     } else {
1835       assert(interpreted_entry_offset == -1, "Must be unset");
1836       assert(exception_offset == -1,         "Must be unset");
1837     }
1838     assert(frame_complete != -1,    "Must be set");
1839     assert(stack_slots != -1,       "Must be set");
1840     assert(vep_offset != -1,        "Must be set");
1841 #endif
1842 
1843     __ flush();
1844     nmethod* nm = nmethod::new_native_nmethod(method,
1845                                               compile_id,
1846                                               masm->code(),
1847                                               vep_offset,
1848                                               frame_complete,
1849                                               stack_slots,
1850                                               in_ByteSize(-1),
1851                                               in_ByteSize(-1),
1852                                               oop_maps,
1853                                               exception_offset);
1854     if (nm == nullptr) return nm;
1855     if (method->is_continuation_enter_intrinsic()) {
1856       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1857     } else if (method->is_continuation_yield_intrinsic()) {
1858       ContinuationEntry::set_yield_code(nm);
1859     }
1860     return nm;
1861   }
1862 
1863   if (method->is_method_handle_intrinsic()) {
1864     vmIntrinsics::ID iid = method->intrinsic_id();
1865     intptr_t start = (intptr_t)__ pc();
1866     int vep_offset = ((intptr_t)__ pc()) - start;
1867     gen_special_dispatch(masm,
1868                          method,
1869                          in_sig_bt,
1870                          in_regs);
1871     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1872     __ flush();
1873     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1874     return nmethod::new_native_nmethod(method,
1875                                        compile_id,
1876                                        masm->code(),
1877                                        vep_offset,
1878                                        frame_complete,
1879                                        stack_slots / VMRegImpl::slots_per_word,
1880                                        in_ByteSize(-1),
1881                                        in_ByteSize(-1),
1882                                        nullptr);
1883   }
1884   address native_func = method->native_function();
1885   assert(native_func != nullptr, "must have function");
1886 
1887   // An OopMap for lock (and class if static)
1888   OopMapSet *oop_maps = new OopMapSet();
1889   intptr_t start = (intptr_t)__ pc();
1890 
1891   // We have received a description of where all the java args are located
1892   // on entry to the wrapper. We need to convert these args to where
1893   // the jni function will expect them. To figure out where they go
1894   // we convert the java signature to a C signature by inserting
1895   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1896 
1897   const int total_in_args = method->size_of_parameters();
1898   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1899 
1900   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1901   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1902 
1903   int argc = 0;
1904   out_sig_bt[argc++] = T_ADDRESS;
1905   if (method->is_static()) {
1906     out_sig_bt[argc++] = T_OBJECT;
1907   }
1908 
1909   for (int i = 0; i < total_in_args ; i++ ) {
1910     out_sig_bt[argc++] = in_sig_bt[i];
1911   }
1912 
1913   // Now figure out where the args must be stored and how much stack space
1914   // they require.
1915   int out_arg_slots;
1916   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1917 
1918   // Compute framesize for the wrapper.  We need to handlize all oops in
1919   // incoming registers
1920 
1921   // Calculate the total number of stack slots we will need.
1922 
1923   // First count the abi requirement plus all of the outgoing args
1924   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1925 
1926   // Now the space for the inbound oop handle area
1927   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1928 
1929   int oop_handle_offset = stack_slots;
1930   stack_slots += total_save_slots;
1931 
1932   // Now any space we need for handlizing a klass if static method
1933 
1934   int klass_slot_offset = 0;
1935   int klass_offset = -1;
1936   int lock_slot_offset = 0;
1937   bool is_static = false;
1938 
1939   if (method->is_static()) {
1940     klass_slot_offset = stack_slots;
1941     stack_slots += VMRegImpl::slots_per_word;
1942     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1943     is_static = true;
1944   }
1945 
1946   // Plus a lock if needed
1947 
1948   if (method->is_synchronized()) {
1949     lock_slot_offset = stack_slots;
1950     stack_slots += VMRegImpl::slots_per_word;
1951   }
1952 
1953   // Now a place (+2) to save return values or temp during shuffling
1954   // + 4 for return address (which we own) and saved rbp
1955   stack_slots += 6;
1956 
1957   // OK, the space we have allocated will look like:
1958   //
1959   //
1960   // FP-> |                     |
1961   //      |---------------------|
1962   //      | 2 slots for moves   |
1963   //      |---------------------|
1964   //      | lock box (if sync)  |
1965   //      |---------------------| <- lock_slot_offset
1966   //      | klass (if static)   |
1967   //      |---------------------| <- klass_slot_offset
1968   //      | oopHandle area      |
1969   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1970   //      | outbound memory     |
1971   //      | based arguments     |
1972   //      |                     |
1973   //      |---------------------|
1974   //      |                     |
1975   // SP-> | out_preserved_slots |
1976   //
1977   //
1978 
1979 
1980   // Now compute the actual number of stack words we need, rounding to make
1981   // the stack properly aligned.
1982   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1983 
1984   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
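
       // Worked example (illustrative only; assuming Linux-x64 and a
       // non-static, non-synchronized native taking two ints): all C
       // arguments fit in registers, so out_preserve_stack_slots() and
       // out_arg_slots are both 0; adding the 12-slot oop handle area and the
       // 6 slots above gives 18 slots, which aligns up to 20 slots, i.e.
       // stack_size == 80 bytes.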
1985 
1986   // First thing make an ic check to see if we should even be here
1987 
1988   // We are free to use all registers as temps without saving them and
1989   // restoring them except rbp. rbp is the only callee save register
1990   // as far as the interpreter and the compiler(s) are concerned.
1991 
1992   const Register receiver = j_rarg0;
1993 
1994   Label exception_pending;
1995 
1996   assert_different_registers(receiver, rscratch1, rscratch2);
1997   __ verify_oop(receiver);
1998   __ ic_check(8 /* end_alignment */);
1999 
2000   int vep_offset = ((intptr_t)__ pc()) - start;
2001 
2002   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2003     Label L_skip_barrier;
2004     Register klass = r10;
2005     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2006     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2007 
2008     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2009 
2010     __ bind(L_skip_barrier);
2011   }
2012 
2013 #ifdef COMPILER1
2014   // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
2015   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2016     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2017   }
2018 #endif // COMPILER1
2019 
2020   // The instruction at the verified entry point must be 5 bytes or longer
2021   // because it can be patched on the fly by make_non_entrant. The stack bang
2022   // instruction fits that requirement.
2023 
2024   // Generate stack overflow check
2025   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2026 
2027   // Generate a new frame for the wrapper.
2028   __ enter();
2029   // -2 because return address is already present and so is saved rbp
2030   __ subptr(rsp, stack_size - 2*wordSize);
2031 
2032   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2033   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2034   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2035 
2036   // Frame is now completed as far as size and linkage.
2037   int frame_complete = ((intptr_t)__ pc()) - start;
2038 
2039 #ifdef ASSERT
2040   __ check_stack_alignment(rsp, "improperly aligned stack");
2041 #endif /* ASSERT */
2042 
2043 
2044   // We use r14 as the oop handle for the receiver/klass
2045   // It is callee save so it survives the call to native
2046 
2047   const Register oop_handle_reg = r14;
2048 
2049   //
2050   // We immediately shuffle the arguments so that for any vm call we have to
2051   // make from here on out (sync slow path, jvmti, etc.) we will already have
2052   // captured the oops from our caller and have a valid oopMap for
2053   // them.
2054 
2055   // -----------------
2056   // The Grand Shuffle
2057 
2058   // The Java calling convention is either equal to (linux) or denser than (win64) the
2059   // c calling convention. However, because of the jni_env argument the c calling
2060   // convention always has at least one more (and two for static) arguments than Java.
2061   // Therefore if we move the args from java -> c backwards then we will never have
2062   // a register->register conflict and we don't have to build a dependency graph
2063   // and figure out how to break any cycles.
2064   //
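       // Illustration (Linux-x64 register names, non-static method, comment
       // only): the C home of Java argument i is C argument i + 1 because of
       // the leading JNIEnv*, and j_rarg0..j_rarg4 are the same registers as
       // c_rarg1..c_rarg5, so most register arguments are already in place;
       // walking from the last argument down, every remaining move writes to
       // a location that is no longer needed as a source.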
2065 
2066   // Record esp-based slot for receiver on stack for non-static methods
2067   int receiver_offset = -1;
2068 
2069   // This is a trick. We double the stack slots so we can claim
2070   // the oops in the caller's frame. Since we are sure to have
2071   // more args than the caller, doubling is enough to make
2072   // sure we can capture all the incoming oop args from the
2073   // caller.
2074   //
2075   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2076 
2077   // Mark location of rbp (someday)
2078   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2079 
2080   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2081   // All inbound args are referenced based on rbp and all outbound args via rsp.
2082 
2083 
2084 #ifdef ASSERT
2085   bool reg_destroyed[Register::number_of_registers];
2086   bool freg_destroyed[XMMRegister::number_of_registers];
2087   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2088     reg_destroyed[r] = false;
2089   }
2090   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2091     freg_destroyed[f] = false;
2092   }
2093 
2094 #endif /* ASSERT */
2095 
2096   // For JNI natives the incoming and outgoing registers are offset upwards.
2097   GrowableArray<int> arg_order(2 * total_in_args);
2098 
2099   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2100     arg_order.push(i);
2101     arg_order.push(c_arg);
2102   }
2103 
2104   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2105     int i = arg_order.at(ai);
2106     int c_arg = arg_order.at(ai + 1);
2107     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2108 #ifdef ASSERT
2109     if (in_regs[i].first()->is_Register()) {
2110       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2111     } else if (in_regs[i].first()->is_XMMRegister()) {
2112       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2113     }
2114     if (out_regs[c_arg].first()->is_Register()) {
2115       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2116     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2117       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2118     }
2119 #endif /* ASSERT */
2120     switch (in_sig_bt[i]) {
2121       case T_ARRAY:
2122       case T_OBJECT:
2123         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2124                     ((i == 0) && (!is_static)),
2125                     &receiver_offset);
2126         break;
2127       case T_VOID:
2128         break;
2129 
2130       case T_FLOAT:
2131         __ float_move(in_regs[i], out_regs[c_arg]);
2132         break;
2133 
2134       case T_DOUBLE:
2135         assert( i + 1 < total_in_args &&
2136                 in_sig_bt[i + 1] == T_VOID &&
2137                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2138         __ double_move(in_regs[i], out_regs[c_arg]);
2139         break;
2140 
2141       case T_LONG :
2142         __ long_move(in_regs[i], out_regs[c_arg]);
2143         break;
2144 
2145       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2146 
2147       default:
2148         __ move32_64(in_regs[i], out_regs[c_arg]);
2149     }
2150   }
2151 
2152   int c_arg;
2153 
2154   // Pre-load a static method's oop into r14.  Used both by locking code and
2155   // the normal JNI call code.
2156   // point c_arg at the first arg that is already loaded in case we
2157   // need to spill before we call out
2158   c_arg = total_c_args - total_in_args;
2159 
2160   if (method->is_static()) {
2161 
2162     //  load oop into a register
2163     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2164 
2165     // Now handlize the static class mirror; it's known to be not-null.
2166     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2167     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2168 
2169     // Now get the handle
2170     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2171     // store the klass handle as second argument
2172     __ movptr(c_rarg1, oop_handle_reg);
2173     // and protect the arg if we must spill
2174     c_arg--;
2175   }
2176 
2177   // Change state to native (we save the return address in the thread, since it might not
2178   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2179   // points into the right code segment. It does not have to be the correct return pc.
2180   // We use the same pc/oopMap repeatedly when we call out
2181 
2182   Label native_return;
2183   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2184     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2185     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2186   } else {
2187     intptr_t the_pc = (intptr_t) __ pc();
2188     oop_maps->add_gc_map(the_pc - start, map);
2189 
2190     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2191   }
2192 
2193   // We have all of the arguments set up at this point. We must not touch any register
2194   // argument registers from here on (if we save/restore them there is no oopMap describing them).
2195 
2196   if (DTraceMethodProbes) {
2197     // protect the args we've loaded
2198     save_args(masm, total_c_args, c_arg, out_regs);
2199     __ mov_metadata(c_rarg1, method());
2200     __ call_VM_leaf(
2201       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2202       r15_thread, c_rarg1);
2203     restore_args(masm, total_c_args, c_arg, out_regs);
2204   }
2205 
2206   // RedefineClasses() tracing support for obsolete method entry
2207   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2208     // protect the args we've loaded
2209     save_args(masm, total_c_args, c_arg, out_regs);
2210     __ mov_metadata(c_rarg1, method());
2211     __ call_VM_leaf(
2212       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2213       r15_thread, c_rarg1);
2214     restore_args(masm, total_c_args, c_arg, out_regs);
2215   }
2216 
2217   // Lock a synchronized method
2218 
2219   // Register definitions used by locking and unlocking
2220 
2221   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2222   const Register obj_reg  = rbx;  // Will contain the oop
2223   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2224   const Register old_hdr  = r13;  // value of old header at unlock time
2225 
2226   Label slow_path_lock;
2227   Label lock_done;
2228 
2229   if (method->is_synchronized()) {
2230     Label count_mon;
2231 
2232     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2233 
2234     // Get the handle (the 2nd argument)
2235     __ mov(oop_handle_reg, c_rarg1);
2236 
2237     // Get address of the box
2238 
2239     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2240 
2241     // Load the oop from the handle
2242     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2243 
2244     if (LockingMode == LM_MONITOR) {
2245       __ jmp(slow_path_lock);
2246     } else if (LockingMode == LM_LEGACY) {
2247       // Load immediate 1 into swap_reg %rax
2248       __ movl(swap_reg, 1);
2249 
2250       // Load (object->mark() | 1) into swap_reg %rax
2251       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2252 
2253       // Save (object->mark() | 1) into BasicLock's displaced header
2254       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2255 
2256       // src -> dest iff dest == rax else rax <- dest
2257       __ lock();
2258       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2259       __ jcc(Assembler::equal, count_mon);
2260 
2261       // Hmm should this move to the slow path code area???
2262 
2263       // Test if the oopMark is an obvious stack pointer, i.e.,
2264       //  1) (mark & 3) == 0, and
2265       //  2) rsp <= mark < rsp + os::pagesize()
2266       // These 3 tests can be done by evaluating the following
2267       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2268       // assuming both stack pointer and pagesize have their
2269       // least significant 2 bits clear.
2270       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
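           // Example of the arithmetic: with a 4096-byte page, 3 - page_size
           // is 0x...f003, so the AND result is zero exactly when
           // (mark - rsp) is 4-byte aligned and smaller than one page, i.e.
           // the mark points at a BasicLock slot in our own frame (recursive
           // locking).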
2271 
2272       __ subptr(swap_reg, rsp);
2273       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2274 
2275       // Save the test result, for recursive case, the result is zero
2276       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2277       __ jcc(Assembler::notEqual, slow_path_lock);
2278 
2279       __ bind(count_mon);
2280       __ inc_held_monitor_count();
2281     } else {
2282       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2283       __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2284     }
2285 
2286     // Slow path will re-enter here
2287     __ bind(lock_done);
2288   }
2289 
2290   // Finally just about ready to make the JNI call
2291 
2292   // get JNIEnv* which is first argument to native
2293   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2294 
2295   // Now set thread in native
2296   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2297 
2298   __ call(RuntimeAddress(native_func));
2299 
2300   // Verify or restore cpu control state after JNI call
2301   __ restore_cpu_control_state_after_jni(rscratch1);
2302 
2303   // Unpack native results.
2304   switch (ret_type) {
2305   case T_BOOLEAN: __ c2bool(rax);            break;
2306   case T_CHAR   : __ movzwl(rax, rax);      break;
2307   case T_BYTE   : __ sign_extend_byte (rax); break;
2308   case T_SHORT  : __ sign_extend_short(rax); break;
2309   case T_INT    : /* nothing to do */        break;
2310   case T_DOUBLE :
2311   case T_FLOAT  :
2312     // Result is in xmm0 we'll save as needed
2313     break;
2314   case T_ARRAY:                 // Really a handle
2315   case T_OBJECT:                // Really a handle
2316       break; // can't de-handlize until after safepoint check
2317   case T_VOID: break;
2318   case T_LONG: break;
2319   default       : ShouldNotReachHere();
2320   }
2321 
2322   // Switch thread to "native transition" state before reading the synchronization state.
2323   // This additional state is necessary because reading and testing the synchronization
2324   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2325   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2326   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2327   //     Thread A is resumed to finish this native method, but doesn't block here since it
2328   //     didn't see any synchronization in progress, and escapes.
2329   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2330 
2331   // Force this write out before the read below
2332   if (!UseSystemMemoryBarrier) {
2333     __ membar(Assembler::Membar_mask_bits(
2334               Assembler::LoadLoad | Assembler::LoadStore |
2335               Assembler::StoreLoad | Assembler::StoreStore));
2336   }
2337 
2338   // check for safepoint operation in progress and/or pending suspend requests
2339   {
2340     Label Continue;
2341     Label slow_path;
2342 
2343     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2344 
2345     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2346     __ jcc(Assembler::equal, Continue);
2347     __ bind(slow_path);
2348 
2349     // Don't use call_VM as it will see a possible pending exception and forward it
2350     // and never return here, preventing us from clearing _last_native_pc down below.
2351     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2352     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2353     // by hand.
2354     //
2355     __ vzeroupper();
2356     save_native_result(masm, ret_type, stack_slots);
2357     __ mov(c_rarg0, r15_thread);
2358     __ mov(r12, rsp); // remember sp
2359     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2360     __ andptr(rsp, -16); // align stack as required by ABI
2361     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2362     __ mov(rsp, r12); // restore sp
2363     __ reinit_heapbase();
2364     // Restore any method result value
2365     restore_native_result(masm, ret_type, stack_slots);
2366     __ bind(Continue);
2367   }
2368 
2369   // change thread state
2370   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2371 
2372   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2373     // Check preemption for Object.wait()
2374     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2375     __ cmpptr(rscratch1, NULL_WORD);
2376     __ jccb(Assembler::equal, native_return);
2377     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2378     __ jmp(rscratch1);
2379     __ bind(native_return);
2380 
2381     intptr_t the_pc = (intptr_t) __ pc();
2382     oop_maps->add_gc_map(the_pc - start, map);
2383   }
2384 
2385 
2386   Label reguard;
2387   Label reguard_done;
2388   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2389   __ jcc(Assembler::equal, reguard);
2390   __ bind(reguard_done);
2391 
2392   // native result if any is live
2393 
2394   // Unlock
2395   Label slow_path_unlock;
2396   Label unlock_done;
2397   if (method->is_synchronized()) {
2398 
2399     Label fast_done;
2400 
2401     // Get locked oop from the handle we passed to jni
2402     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2403 
2404     if (LockingMode == LM_LEGACY) {
2405       Label not_recur;
2406       // Simple recursive lock?
2407       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2408       __ jcc(Assembler::notEqual, not_recur);
2409       __ dec_held_monitor_count();
2410       __ jmpb(fast_done);
2411       __ bind(not_recur);
2412     }
2413 
2414     // Must save rax if it is live now because cmpxchg must use it
2415     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2416       save_native_result(masm, ret_type, stack_slots);
2417     }
2418 
2419     if (LockingMode == LM_MONITOR) {
2420       __ jmp(slow_path_unlock);
2421     } else if (LockingMode == LM_LEGACY) {
2422       // get address of the stack lock
2423       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2424       //  get old displaced header
2425       __ movptr(old_hdr, Address(rax, 0));
2426 
2427       // Atomic swap old header if oop still contains the stack lock
2428       __ lock();
2429       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2430       __ jcc(Assembler::notEqual, slow_path_unlock);
2431       __ dec_held_monitor_count();
2432     } else {
2433       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2434       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2435     }
2436 
2437     // slow path re-enters here
2438     __ bind(unlock_done);
2439     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2440       restore_native_result(masm, ret_type, stack_slots);
2441     }
2442 
2443     __ bind(fast_done);
2444   }
2445   if (DTraceMethodProbes) {
2446     save_native_result(masm, ret_type, stack_slots);
2447     __ mov_metadata(c_rarg1, method());
2448     __ call_VM_leaf(
2449          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2450          r15_thread, c_rarg1);
2451     restore_native_result(masm, ret_type, stack_slots);
2452   }
2453 
2454   __ reset_last_Java_frame(false);
2455 
2456   // Unbox oop result, e.g. JNIHandles::resolve value.
2457   if (is_reference_type(ret_type)) {
2458     __ resolve_jobject(rax /* value */,
2459                        rcx /* tmp */);
2460   }
2461 
2462   if (CheckJNICalls) {
2463     // clear_pending_jni_exception_check
2464     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2465   }
2466 
2467   // reset handle block
2468   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2469   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2470 
2471   // pop our frame
2472 
2473   __ leave();
2474 
2475   // Any exception pending?
2476   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2477   __ jcc(Assembler::notEqual, exception_pending);
2478 
2479   // Return
2480 
2481   __ ret(0);
2482 
2483   // Unexpected paths are out of line and go here
2484 
2485   // forward the exception
2486   __ bind(exception_pending);
2487 
2488   // and forward the exception
2489   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2490 
2491   // Slow path locking & unlocking
2492   if (method->is_synchronized()) {
2493 
2494     // BEGIN Slow path lock
2495     __ bind(slow_path_lock);
2496 
2497     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2498     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2499 
2500     // protect the args we've loaded
2501     save_args(masm, total_c_args, c_arg, out_regs);
2502 
2503     __ mov(c_rarg0, obj_reg);
2504     __ mov(c_rarg1, lock_reg);
2505     __ mov(c_rarg2, r15_thread);
2506 
2507     // Not a leaf but we have last_Java_frame setup as we want.
2508     // We don't want to unmount in case of contention since that would complicate preserving
2509     // the arguments that had already been marshalled into the native convention. So we force
2510     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2511     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2512     __ push_cont_fastpath();
2513     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2514     __ pop_cont_fastpath();
2515     restore_args(masm, total_c_args, c_arg, out_regs);
2516 
2517 #ifdef ASSERT
2518     { Label L;
2519     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2520     __ jcc(Assembler::equal, L);
2521     __ stop("no pending exception allowed on exit from monitorenter");
2522     __ bind(L);
2523     }
2524 #endif
2525     __ jmp(lock_done);
2526 
2527     // END Slow path lock
2528 
2529     // BEGIN Slow path unlock
2530     __ bind(slow_path_unlock);
2531 
2532     // If we haven't already saved the native result we must save it now as xmm registers
2533     // are still exposed.
2534     __ vzeroupper();
2535     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2536       save_native_result(masm, ret_type, stack_slots);
2537     }
2538 
2539     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2540 
2541     __ mov(c_rarg0, obj_reg);
2542     __ mov(c_rarg2, r15_thread);
2543     __ mov(r12, rsp); // remember sp
2544     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2545     __ andptr(rsp, -16); // align stack as required by ABI
2546 
2547     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2548     // NOTE that obj_reg == rbx currently
2549     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2550     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2551 
2552     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2553     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2554     __ mov(rsp, r12); // restore sp
2555     __ reinit_heapbase();
2556 #ifdef ASSERT
2557     {
2558       Label L;
2559       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2560       __ jcc(Assembler::equal, L);
2561       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2562       __ bind(L);
2563     }
2564 #endif /* ASSERT */
2565 
2566     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2567 
2568     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2569       restore_native_result(masm, ret_type, stack_slots);
2570     }
2571     __ jmp(unlock_done);
2572 
2573     // END Slow path unlock
2574 
2575   } // synchronized
2576 
2577   // SLOW PATH Reguard the stack if needed
2578 
2579   __ bind(reguard);
2580   __ vzeroupper();
2581   save_native_result(masm, ret_type, stack_slots);
2582   __ mov(r12, rsp); // remember sp
2583   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2584   __ andptr(rsp, -16); // align stack as required by ABI
2585   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2586   __ mov(rsp, r12); // restore sp
2587   __ reinit_heapbase();
2588   restore_native_result(masm, ret_type, stack_slots);
2589   // and continue
2590   __ jmp(reguard_done);
2591 
2592 
2593 
2594   __ flush();
2595 
2596   nmethod *nm = nmethod::new_native_nmethod(method,
2597                                             compile_id,
2598                                             masm->code(),
2599                                             vep_offset,
2600                                             frame_complete,
2601                                             stack_slots / VMRegImpl::slots_per_word,
2602                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2603                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2604                                             oop_maps);
2605 
2606   return nm;
2607 }
2608 
2609 // This function returns the adjustment size (in number of words) to a c2i adapter
2610 // activation for use during deoptimization
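     // (For example, callee_parameters == 2 and callee_locals == 5 gives an
     // adjustment of 3 * Interpreter::stackElementWords words, the extra
     // local slots the interpreter activation needs beyond the parameters.)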
2611 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2612   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2613 }
2614 
2615 
2616 uint SharedRuntime::out_preserve_stack_slots() {
2617   return 0;
2618 }
2619 
2620 
2621 // Number of stack slots between incoming argument block and the start of
2622 // a new frame.  The PROLOG must add this many slots to the stack.  The
2623 // EPILOG must remove this many slots.  amd64 needs two slots for
2624 // return address.
2625 uint SharedRuntime::in_preserve_stack_slots() {
2626   return 4 + 2 * VerifyStackAtCalls;
2627 }
2628 
2629 VMReg SharedRuntime::thread_register() {
2630   return r15_thread->as_VMReg();
2631 }
2632 
2633 //------------------------------generate_deopt_blob----------------------------
2634 void SharedRuntime::generate_deopt_blob() {
2635   // Allocate space for the code
2636   ResourceMark rm;
2637   // Setup code generation tools
2638   int pad = 0;
2639   if (UseAVX > 2) {
2640     pad += 1024;
2641   }
2642   if (UseAPX) {
2643     pad += 1024;
2644   }
2645 #if INCLUDE_JVMCI
2646   if (EnableJVMCI) {
2647     pad += 512; // Increase the buffer size when compiling for JVMCI
2648   }
2649 #endif
2650   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2651   CodeBuffer buffer(name, 2560+pad, 1024);
2652   MacroAssembler* masm = new MacroAssembler(&buffer);
2653   int frame_size_in_words;
2654   OopMap* map = nullptr;
2655   OopMapSet *oop_maps = new OopMapSet();
2656 
2657   // -------------
2658   // This code enters when returning to a de-optimized nmethod.  A return
2659   // address has been pushed on the stack, and return values are in
2660   // registers.
2661   // If we are doing a normal deopt then we were called from the patched
2662   // nmethod at the point where we returned into the nmethod, so the return
2663   // address on the stack is off by NativeCall::instruction_size.
2664   // We will adjust the value so it looks like we have the original return
2665   // address on the stack (as when we eagerly deoptimized).
2666   // In the case of an exception pending when deoptimizing, we enter
2667   // with a return address on the stack that points after the call we patched
2668   // into the exception handler. We have the following register state from,
2669   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2670   //    rax: exception oop
2671   //    rbx: exception handler
2672   //    rdx: throwing pc
2673   // So in this case we simply jam rdx into the useless return address and
2674   // the stack looks just like we want.
2675   //
2676   // At this point we need to de-opt.  We save the argument return
2677   // registers.  We call the first C routine, fetch_unroll_info().  This
2678   // routine captures the return values and returns a structure which
2679   // describes the current frame size and the sizes of all replacement frames.
2680   // The current frame is compiled code and may contain many inlined
2681   // functions, each with their own JVM state.  We pop the current frame, then
2682   // push all the new frames.  Then we call the C routine unpack_frames() to
2683   // populate these frames.  Finally unpack_frames() returns us the new target
2684   // address.  Notice that callee-save registers are BLOWN here; they have
2685   // already been captured in the vframeArray at the time the return PC was
2686   // patched.
2687   address start = __ pc();
2688   Label cont;
2689 
2690   // Prolog for the non-exception case!
2691 
2692   // Save everything in sight.
2693   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2694 
2695   // Normal deoptimization.  Save exec mode for unpack_frames.
2696   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2697   __ jmp(cont);
2698 
2699   int reexecute_offset = __ pc() - start;
2700 #if INCLUDE_JVMCI && !defined(COMPILER1)
2701   if (UseJVMCICompiler) {
2702     // JVMCI does not use this kind of deoptimization
2703     __ should_not_reach_here();
2704   }
2705 #endif
2706 
2707   // Reexecute case
2708   // The return address is the pc that describes which bci to re-execute at.
2709 
2710   // No need to update map as each call to save_live_registers will produce an identical oopmap
2711   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2712 
2713   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2714   __ jmp(cont);
2715 
2716 #if INCLUDE_JVMCI
2717   Label after_fetch_unroll_info_call;
2718   int implicit_exception_uncommon_trap_offset = 0;
2719   int uncommon_trap_offset = 0;
2720 
2721   if (EnableJVMCI) {
2722     implicit_exception_uncommon_trap_offset = __ pc() - start;
2723 
2724     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2725     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2726 
2727     uncommon_trap_offset = __ pc() - start;
2728 
2729     // Save everything in sight.
2730     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2731     // fetch_unroll_info needs to call last_java_frame()
2732     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2733 
2734     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2735     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2736 
2737     __ movl(r14, Deoptimization::Unpack_reexecute);
2738     __ mov(c_rarg0, r15_thread);
2739     __ movl(c_rarg2, r14); // exec mode
2740     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2741     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2742 
2743     __ reset_last_Java_frame(false);
2744 
2745     __ jmp(after_fetch_unroll_info_call);
2746   } // EnableJVMCI
2747 #endif // INCLUDE_JVMCI
2748 
2749   int exception_offset = __ pc() - start;
2750 
2751   // Prolog for exception case
2752 
2753   // All registers are dead at this entry point, except for rax and
2754   // rdx, which contain the exception oop and exception pc
2755   // respectively.  Set them in TLS and fall through to the
2756   // unpack_with_exception_in_tls entry point.
2757 
2758   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2759   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2760 
2761   int exception_in_tls_offset = __ pc() - start;
2762 
2763   // new implementation because exception oop is now passed in JavaThread
2764 
2765   // Prolog for exception case
2766   // All registers must be preserved because they might be used by LinearScan
2767   // Exception oop and throwing PC are passed in JavaThread
2768   // tos: stack at point of call to method that threw the exception (i.e. only
2769   // args are on the stack, no return address)
2770 
2771   // make room on stack for the return address
2772   // It will be patched later with the throwing pc. The correct value is not
2773   // available now because loading it from memory would destroy registers.
2774   __ push(0);
2775 
2776   // Save everything in sight.
2777   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2778 
2779   // Now it is safe to overwrite any register
2780 
2781   // Deopt during an exception.  Save exec mode for unpack_frames.
2782   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2783 
2784   // load throwing pc from JavaThread and patch it as the return address
2785   // of the current frame. Then clear the field in JavaThread
2786 
2787   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2788   __ movptr(Address(rbp, wordSize), rdx);
2789   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2790 
2791 #ifdef ASSERT
2792   // verify that there is really an exception oop in JavaThread
2793   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2794   __ verify_oop(rax);
2795 
2796   // verify that there is no pending exception
2797   Label no_pending_exception;
2798   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2799   __ testptr(rax, rax);
2800   __ jcc(Assembler::zero, no_pending_exception);
2801   __ stop("must not have pending exception here");
2802   __ bind(no_pending_exception);
2803 #endif
2804 
2805   __ bind(cont);
2806 
2807   // Call C code.  Need thread and this frame, but NOT official VM entry
2808   // crud.  We cannot block on this call, no GC can happen.
2809   //
2810   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2811 
2812   // fetch_unroll_info needs to call last_java_frame().
2813 
2814   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2815 #ifdef ASSERT
2816   { Label L;
2817     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2818     __ jcc(Assembler::equal, L);
2819     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2820     __ bind(L);
2821   }
2822 #endif // ASSERT
2823   __ mov(c_rarg0, r15_thread);
2824   __ movl(c_rarg1, r14); // exec_mode
2825   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2826 
2827   // Need to have an oopmap that tells fetch_unroll_info where to
2828   // find any register it might need.
2829   oop_maps->add_gc_map(__ pc() - start, map);
2830 
2831   __ reset_last_Java_frame(false);
2832 
2833 #if INCLUDE_JVMCI
2834   if (EnableJVMCI) {
2835     __ bind(after_fetch_unroll_info_call);
2836   }
2837 #endif
2838 
2839   // Load UnrollBlock* into rdi
2840   __ mov(rdi, rax);
2841 
2842   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2843   Label noException;
2844   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2845   __ jcc(Assembler::notEqual, noException);
2846   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2847   // QQQ this is useless it was null above
2848   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2849   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2850   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2851 
2852   __ verify_oop(rax);
2853 
2854   // Overwrite the result registers with the exception results.
2855   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2856   // I think this is useless
2857   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2858 
2859   __ bind(noException);
2860 
2861   // Only register save data is on the stack.
2862   // Now restore the result registers.  Everything else is either dead
2863   // or captured in the vframeArray.
2864   RegisterSaver::restore_result_registers(masm);
2865 
2866   // All of the register save area has been popped off the stack. Only the
2867   // return address remains.
2868 
2869   // Pop all the frames we must move/replace.
2870   //
2871   // Frame picture (youngest to oldest)
2872   // 1: self-frame (no frame link)
2873   // 2: deopting frame  (no frame link)
2874   // 3: caller of deopting frame (could be compiled/interpreted).
2875   //
2876   // Note: by leaving the return address of the self-frame on the stack
2877   // and using the size of frame 2 to adjust the stack,
2878   // the return address to frame 3 will still be on the stack when we are done.
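       //
       // Illustrative walk-through (a sketch of the steps below, not additional
       // behavior): addptr(rsp, size_of_deoptimized_frame) discards frame 2 and
       // leaves rsp at the return address into frame 3; that stale pc is then
       // dropped (addptr(rsp, wordSize)), caller_adjustment makes room for any
       // extra interpreter locals, and the loop below pushes one skeletal
       // interpreter frame per entry in the frame_sizes/frame_pcs arrays.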
2879 
2880   // Pop deoptimized frame
2881   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2882   __ addptr(rsp, rcx);
2883 
2884   // rsp should be pointing at the return address to the caller (3)
2885 
2886   // Pick up the initial fp we should save
2887   // Restore rbp before the stack bang because, if a stack overflow is thrown, it needs to be pushed (and preserved)
2888   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2889 
2890 #ifdef ASSERT
2891   // Compilers generate code that bangs the stack by as much as the
2892   // interpreter would need. So this stack banging should never
2893   // trigger a fault. Verify that it does not on non-product builds.
2894   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2895   __ bang_stack_size(rbx, rcx);
2896 #endif
2897 
2898   // Load address of array of frame pcs into rcx
2899   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2900 
2901   // Trash the old pc
2902   __ addptr(rsp, wordSize);
2903 
2904   // Load address of array of frame sizes into rsi
2905   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2906 
2907   // Load counter into rdx
2908   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2909 
2910   // Now adjust the caller's stack to make up for the extra locals
2911   // but record the original sp so that we can save it in the skeletal interpreter
2912   // frame; the stack walking of interpreter_sender will then get the unextended sp
2913   // value rather than the "real" sp value.
2914 
2915   const Register sender_sp = r8;
2916 
2917   __ mov(sender_sp, rsp);
2918   __ movl(rbx, Address(rdi,
2919                        Deoptimization::UnrollBlock::
2920                        caller_adjustment_offset()));
2921   __ subptr(rsp, rbx);
2922 
2923   // Push interpreter frames in a loop
2924   Label loop;
2925   __ bind(loop);
2926   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2927   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2928   __ pushptr(Address(rcx, 0));          // Save return address
2929   __ enter();                           // Save old & set new rbp
2930   __ subptr(rsp, rbx);                  // Prolog
2931   // This value is corrected by layout_activation_impl
2932   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2933   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2934   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2935   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2936   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2937   __ decrementl(rdx);                   // Decrement counter
2938   __ jcc(Assembler::notZero, loop);
2939   __ pushptr(Address(rcx, 0));          // Save final return address
2940 
2941   // Re-push self-frame
2942   __ enter();                           // Save old & set new rbp
2943 
2944   // Allocate a full sized register save area.
2945   // Return address and rbp are in place, so we allocate two fewer words.
2946   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2947 
2948   // Restore frame locals after moving the frame
2949   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2950   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2951 
2952   // Call C code.  Need thread but NOT official VM entry
2953   // crud.  We cannot block on this call, no GC can happen.  Call should
2954   // restore return values to their stack-slots with the new SP.
2955   //
2956   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2957 
2958   // Use rbp because the frames look interpreted now
2959   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2960   // Don't need the precise return PC here, just precise enough to point into this code blob.
2961   address the_pc = __ pc();
2962   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2963 
2964   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2965   __ mov(c_rarg0, r15_thread);
2966   __ movl(c_rarg1, r14); // second arg: exec_mode
2967   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2968   // Revert SP alignment after call since we're going to do some SP relative addressing below
2969   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2970 
2971   // Set an oopmap for the call site
2972   // Use the same PC we used for the last java frame
2973   oop_maps->add_gc_map(the_pc - start,
2974                        new OopMap( frame_size_in_words, 0 ));
2975 
2976   // Clear fp AND pc
2977   __ reset_last_Java_frame(true);
2978 
2979   // Collect return values
2980   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2981   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2982   // I think this is useless (throwing pc?)
2983   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2984 
2985   // Pop self-frame.
2986   __ leave();                           // Epilog
2987 
2988   // Jump to interpreter
2989   __ ret(0);
2990 
2991   // Make sure all code is generated
2992   masm->flush();
2993 
2994   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2995   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2996 #if INCLUDE_JVMCI
2997   if (EnableJVMCI) {
2998     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2999     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3000   }
3001 #endif
3002 }
3003 
3004 //------------------------------generate_handler_blob------
3005 //
3006 // Generate a special Compile2Runtime blob that saves all registers
3007 // and sets up an oopmap.
3008 //
3009 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3010   assert(StubRoutines::forward_exception_entry() != nullptr,
3011          "must be generated before");
3012   assert(is_polling_page_id(id), "expected a polling page stub id");
3013 
3014   ResourceMark rm;
3015   OopMapSet *oop_maps = new OopMapSet();
3016   OopMap* map;
3017 
3018   // Allocate space for the code.  Setup code generation tools.
3019   const char* name = SharedRuntime::stub_name(id);
3020   CodeBuffer buffer(name, 2548, 1024);
3021   MacroAssembler* masm = new MacroAssembler(&buffer);
3022 
3023   address start   = __ pc();
3024   address call_pc = nullptr;
3025   int frame_size_in_words;
3026   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3027   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3028 
3029   // Make room for return address (or push it again)
3030   if (!cause_return) {
3031     __ push(rbx);
3032   }
3033 
3034   // Save registers, fpu state, and flags
3035   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3036 
3037   // The following is basically a call_VM.  However, we need the precise
3038   // address of the call in order to generate an oopmap. Hence, we do all the
3039   // work ourselves.
3040 
3041   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3042 
3043   // The return address must always be correct so that the frame constructor
3044   // never sees an invalid pc.
3045 
3046   if (!cause_return) {
3047     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3048     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
3049     // if someone changed the return address for us!
3050     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3051     __ movptr(Address(rbp, wordSize), rbx);
3052   }
3053 
3054   // Do the call
3055   __ mov(c_rarg0, r15_thread);
3056   __ call(RuntimeAddress(call_ptr));
3057 
3058   // Set an oopmap for the call site.  This oopmap will map all
3059   // oop-registers and debug-info registers as callee-saved.  This
3060   // will allow deoptimization at this safepoint to find all possible
3061   // debug-info recordings, as well as let GC find all oops.
3062 
3063   oop_maps->add_gc_map( __ pc() - start, map);
3064 
3065   Label noException;
3066 
3067   __ reset_last_Java_frame(false);
3068 
3069   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3070   __ jcc(Assembler::equal, noException);
3071 
3072   // Exception pending
3073 
3074   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3075 
3076   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3077 
3078   // No exception case
3079   __ bind(noException);
3080 
3081   Label no_adjust;
3082 #ifdef ASSERT
3083   Label bail;
3084 #endif
3085   if (!cause_return) {
3086     Label no_prefix, not_special, check_rex_prefix;
3087 
3088     // If our stashed return pc was modified by the runtime we avoid touching it
3089     __ cmpptr(rbx, Address(rbp, wordSize));
3090     __ jcc(Assembler::notEqual, no_adjust);
3091 
3092     // Skip over the poll instruction.
3093     // See NativeInstruction::is_safepoint_poll()
3094     // Possible encodings:
3095     //      85 00       test   %eax,(%rax)
3096     //      85 01       test   %eax,(%rcx)
3097     //      85 02       test   %eax,(%rdx)
3098     //      85 03       test   %eax,(%rbx)
3099     //      85 06       test   %eax,(%rsi)
3100     //      85 07       test   %eax,(%rdi)
3101     //
3102     //   41 85 00       test   %eax,(%r8)
3103     //   41 85 01       test   %eax,(%r9)
3104     //   41 85 02       test   %eax,(%r10)
3105     //   41 85 03       test   %eax,(%r11)
3106     //   41 85 06       test   %eax,(%r14)
3107     //   41 85 07       test   %eax,(%r15)
3108     //
3109     //      85 04 24    test   %eax,(%rsp)
3110     //   41 85 04 24    test   %eax,(%r12)
3111     //      85 45 00    test   %eax,0x0(%rbp)
3112     //   41 85 45 00    test   %eax,0x0(%r13)
3113     //
3114     // Notes:
3115     //  Format of the legacy MAP0 test instruction:
3116     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3117     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3118     //     register operand and of the base register of the memory operand lie in [0, 8), so we
3119     //     do not require an additional REX prefix (whose REX.B bit stores the MSB of the
3120     //     register encoding), which is why a two-byte encoding is sufficient here.
3121     //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3122     //     register of the memory operand is 1000, so we need an additional REX prefix in this
3123     //     case, thereby adding an additional byte to the instruction encoding.
3124     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
3125     //     supporting the Intel APX extension, we need to emit a two-byte REX2 prefix to hold
3126     //     the most significant two bits of the 5-bit register encoding.
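         //
         //  Illustrative walk-through (derived from the encodings above, not part of
         //  the generated code): for "41 85 45 00" (test %eax,0x0(%r13)) the return
         //  pc is advanced below by 1 byte for the REX.B prefix, by 1 more byte
         //  because the ModRM rm field selects the rsp/rbp family (extra SIB or
         //  disp8 byte), and by 2 bytes for the opcode and ModRM, i.e. 4 bytes in
         //  total, matching the instruction length listed above.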
3127 
3128     if (VM_Version::supports_apx_f()) {
3129       __ cmpb(Address(rbx, 0), Assembler::REX2);
3130       __ jccb(Assembler::notEqual, check_rex_prefix);
3131       __ addptr(rbx, 2);
3132       __ bind(check_rex_prefix);
3133     }
3134     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3135     __ jccb(Assembler::notEqual, no_prefix);
3136     __ addptr(rbx, 1);
3137     __ bind(no_prefix);
3138 #ifdef ASSERT
3139     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3140 #endif
3141     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3142     // r12/rsp 0x04
3143     // r13/rbp 0x05
3144     __ movzbq(rcx, Address(rbx, 1));
3145     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3146     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3147     __ cmpptr(rcx, 1);
3148     __ jccb(Assembler::above, not_special);
3149     __ addptr(rbx, 1);
3150     __ bind(not_special);
3151 #ifdef ASSERT
3152     // Verify the correct encoding of the poll we're about to skip.
3153     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3154     __ jcc(Assembler::notEqual, bail);
3155     // Mask out the modrm bits
3156     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3157     // rax encodes to 0, so if the bits are nonzero it's incorrect
3158     __ jcc(Assembler::notZero, bail);
3159 #endif
3160     // Adjust return pc forward to step over the safepoint poll instruction
3161     __ addptr(rbx, 2);
3162     __ movptr(Address(rbp, wordSize), rbx);
3163   }
3164 
3165   __ bind(no_adjust);
3166   // Normal exit, restore registers and exit.
3167   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3168   __ ret(0);
3169 
3170 #ifdef ASSERT
3171   __ bind(bail);
3172   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3173 #endif
3174 
3175   // Make sure all code is generated
3176   masm->flush();
3177 
3178   // Fill-out other meta info
3179   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3180 }
3181 
3182 //
3183 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3184 //
3185 // Generate a stub that calls into the VM to find out the proper destination
3186 // of a Java call. All the argument registers are live at this point,
3187 // but since this is generic code we don't know what they are, and the caller
3188 // must do any GC of the args.
3189 //
3190 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3191   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3192   assert(is_resolve_id(id), "expected a resolve stub id");
3193 
3194   // allocate space for the code
3195   ResourceMark rm;
3196 
3197   const char* name = SharedRuntime::stub_name(id);
3198   CodeBuffer buffer(name, 1552, 512);
3199   MacroAssembler* masm = new MacroAssembler(&buffer);
3200 
3201   int frame_size_in_words;
3202 
3203   OopMapSet *oop_maps = new OopMapSet();
3204   OopMap* map = nullptr;
3205 
3206   int start = __ offset();
3207 
3208   // No need to save vector registers since they are caller-saved anyway.
3209   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3210 
3211   int frame_complete = __ offset();
3212 
3213   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3214 
3215   __ mov(c_rarg0, r15_thread);
3216 
3217   __ call(RuntimeAddress(destination));
3218 
3219 
3220   // Set an oopmap for the call site.
3221   // We need this not only for callee-saved registers, but also for volatile
3222   // registers that the compiler might be keeping live across a safepoint.
3223 
3224   oop_maps->add_gc_map( __ offset() - start, map);
3225 
3226   // rax contains the address we are going to jump to assuming no exception got installed
3227 
3228   // clear last_Java_sp
3229   __ reset_last_Java_frame(false);
3230   // check for pending exceptions
3231   Label pending;
3232   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3233   __ jcc(Assembler::notEqual, pending);
3234 
3235   // get the returned Method*
3236   __ get_vm_result_metadata(rbx);
3237   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3238 
3239   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3240 
3241   RegisterSaver::restore_live_registers(masm);
3242 
3243   // We are back to the original state on entry and ready to go.
3244 
3245   __ jmp(rax);
3246 
3247   // Pending exception after the safepoint
3248 
3249   __ bind(pending);
3250 
3251   RegisterSaver::restore_live_registers(masm);
3252 
3253   // exception pending => remove activation and forward to exception handler
3254 
3255   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3256 
3257   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3258   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3259 
3260   // -------------
3261   // make sure all code is generated
3262   masm->flush();
3263 
3264   // return the blob
3265   // (the frame size passed below is in words)
3266   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3267 }
3268 
3269 // Continuation point for throwing of implicit exceptions that are
3270 // not handled in the current activation. Fabricates an exception
3271 // oop and initiates normal exception dispatching in this
3272 // frame. Since we need to preserve callee-saved values (currently
3273 // only for C2, but done for C1 as well) we need a callee-saved oop
3274 // map and therefore have to make these stubs into RuntimeStubs
3275 // rather than BufferBlobs.  If the compiler needs all registers to
3276 // be preserved between the fault point and the exception handler
3277 // then it must assume responsibility for that in
3278 // AbstractCompiler::continuation_for_implicit_null_exception or
3279 // continuation_for_implicit_division_by_zero_exception. All other
3280 // implicit exceptions (e.g., NullPointerException or
3281 // AbstractMethodError on entry) are either at call sites or
3282 // otherwise assume that stack unwinding will be initiated, so
3283 // caller saved registers were assumed volatile in the compiler.
3284 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3285   assert(is_throw_id(id), "expected a throw stub id");
3286 
3287   const char* name = SharedRuntime::stub_name(id);
3288 
3289   // Information about frame layout at time of blocking runtime call.
3290   // Note that we only have to preserve callee-saved registers since
3291   // the compilers are responsible for supplying a continuation point
3292   // if they expect all registers to be preserved.
3293   enum layout {
3294     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3295     rbp_off2,
3296     return_off,
3297     return_off2,
3298     framesize // inclusive of return address
3299   };
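       // Illustrative note (a sketch of the layout arithmetic, assuming
       // frame::arg_reg_save_area_bytes is 32 on Windows and 0 elsewhere):
       // framesize is then 12 slots (48 bytes) or 4 slots (16 bytes), so the
       // "subptr(rsp, (framesize-4) << LogBytesPerInt)" prolog below reserves
       // 32 or 0 extra bytes, and framesize/2 is even in both cases, which is
       // what the 16-byte alignment assert below checks.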
3300 
3301   int insts_size = 512;
3302   int locs_size  = 64;
3303 
3304   ResourceMark rm;
3305   const char* timer_msg = "SharedRuntime generate_throw_exception";
3306   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3307 
3308   CodeBuffer code(name, insts_size, locs_size);
3309   OopMapSet* oop_maps  = new OopMapSet();
3310   MacroAssembler* masm = new MacroAssembler(&code);
3311 
3312   address start = __ pc();
3313 
3314   // This is an inlined and slightly modified version of call_VM
3315   // which has the ability to fetch the return PC out of
3316   // thread-local storage and also sets up last_Java_sp slightly
3317   // differently from the real call_VM.
3318 
3319   __ enter(); // required for proper stackwalking of RuntimeStub frame
3320 
3321   assert(is_even(framesize/2), "sp not 16-byte aligned");
3322 
3323   // return address and rbp are already in place
3324   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3325 
3326   int frame_complete = __ pc() - start;
3327 
3328   // Set up last_Java_sp and last_Java_fp
3329   address the_pc = __ pc();
3330   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3331   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3332 
3333   // Call runtime
3334   __ movptr(c_rarg0, r15_thread);
3335   BLOCK_COMMENT("call runtime_entry");
3336   __ call(RuntimeAddress(runtime_entry));
3337 
3338   // Generate oop map
3339   OopMap* map = new OopMap(framesize, 0);
3340 
3341   oop_maps->add_gc_map(the_pc - start, map);
3342 
3343   __ reset_last_Java_frame(true);
3344 
3345   __ leave(); // required for proper stackwalking of RuntimeStub frame
3346 
3347   // check for pending exceptions
3348 #ifdef ASSERT
3349   Label L;
3350   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3351   __ jcc(Assembler::notEqual, L);
3352   __ should_not_reach_here();
3353   __ bind(L);
3354 #endif // ASSERT
3355   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3356 
3357 
3358   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3359   RuntimeStub* stub =
3360     RuntimeStub::new_runtime_stub(name,
3361                                   &code,
3362                                   frame_complete,
3363                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3364                                   oop_maps, false);
3365   return stub;
3366 }
3367 
3368 //------------------------------Montgomery multiplication------------------------
3369 //
3370 
3371 #ifndef _WINDOWS
3372 
3373 // Subtract 0:b from carry:a.  Return carry.
3374 static julong
3375 sub(julong a[], julong b[], julong carry, long len) {
3376   long long i = 0, cnt = len;
3377   julong tmp;
3378   asm volatile("clc; "
3379                "0: ; "
3380                "mov (%[b], %[i], 8), %[tmp]; "
3381                "sbb %[tmp], (%[a], %[i], 8); "
3382                "inc %[i]; dec %[cnt]; "
3383                "jne 0b; "
3384                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3385                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3386                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3387                : "memory");
3388   return tmp;
3389 }
3390 
3391 // Multiply (unsigned) Long A by Long B, accumulating the double-
3392 // length result into the accumulator formed of T0, T1, and T2.
3393 #define MACC(A, B, T0, T1, T2)                                  \
3394 do {                                                            \
3395   unsigned long hi, lo;                                         \
3396   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3397            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3398            : "r"(A), "a"(B) : "cc");                            \
3399  } while(0)
3400 
3401 // As above, but add twice the double-length result into the
3402 // accumulator.
3403 #define MACC2(A, B, T0, T1, T2)                                 \
3404 do {                                                            \
3405   unsigned long hi, lo;                                         \
3406   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3407            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3408            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3409            : "r"(A), "a"(B) : "cc");                            \
3410  } while(0)
3411 
3412 #else //_WINDOWS
3413 
3414 static julong
3415 sub(julong a[], julong b[], julong carry, long len) {
3416   long i;
3417   julong tmp;
3418   unsigned char c = 1;
3419   for (i = 0; i < len; i++) {
3420     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3421     a[i] = tmp;
3422   }
3423   c = _addcarry_u64(c, carry, ~0, &tmp);
3424   return tmp;
3425 }
3426 
3427 // Multiply (unsigned) Long A by Long B, accumulating the double-
3428 // length result into the accumulator formed of T0, T1, and T2.
3429 #define MACC(A, B, T0, T1, T2)                          \
3430 do {                                                    \
3431   julong hi, lo;                            \
3432   lo = _umul128(A, B, &hi);                             \
3433   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3434   c = _addcarry_u64(c, hi, T1, &T1);                    \
3435   _addcarry_u64(c, T2, 0, &T2);                         \
3436  } while(0)
3437 
3438 // As above, but add twice the double-length result into the
3439 // accumulator.
3440 #define MACC2(A, B, T0, T1, T2)                         \
3441 do {                                                    \
3442   julong hi, lo;                            \
3443   lo = _umul128(A, B, &hi);                             \
3444   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3445   c = _addcarry_u64(c, hi, T1, &T1);                    \
3446   _addcarry_u64(c, T2, 0, &T2);                         \
3447   c = _addcarry_u64(0, lo, T0, &T0);                    \
3448   c = _addcarry_u64(c, hi, T1, &T1);                    \
3449   _addcarry_u64(c, T2, 0, &T2);                         \
3450  } while(0)
3451 
3452 #endif //_WINDOWS
3453 
3454 // Fast Montgomery multiplication.  The derivation of the algorithm is
3455 // in  A Cryptographic Library for the Motorola DSP56000,
3456 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
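     //
     // Illustrative note on the per-word step (a restatement of the loop invariant
     // below, not a change to it): inv satisfies inv * n[0] == -1 (mod 2^64), so
     // choosing m[i] = t0 * inv makes t0 + m[i] * n[0] == 0 (mod 2^64); after the
     // final MACC the low accumulator word is therefore exactly zero (see the
     // assert) and the accumulator is shifted down one word.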
3457 
3458 static void NOINLINE
3459 montgomery_multiply(julong a[], julong b[], julong n[],
3460                     julong m[], julong inv, int len) {
3461   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3462   int i;
3463 
3464   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3465 
3466   for (i = 0; i < len; i++) {
3467     int j;
3468     for (j = 0; j < i; j++) {
3469       MACC(a[j], b[i-j], t0, t1, t2);
3470       MACC(m[j], n[i-j], t0, t1, t2);
3471     }
3472     MACC(a[i], b[0], t0, t1, t2);
3473     m[i] = t0 * inv;
3474     MACC(m[i], n[0], t0, t1, t2);
3475 
3476     assert(t0 == 0, "broken Montgomery multiply");
3477 
3478     t0 = t1; t1 = t2; t2 = 0;
3479   }
3480 
3481   for (i = len; i < 2*len; i++) {
3482     int j;
3483     for (j = i-len+1; j < len; j++) {
3484       MACC(a[j], b[i-j], t0, t1, t2);
3485       MACC(m[j], n[i-j], t0, t1, t2);
3486     }
3487     m[i-len] = t0;
3488     t0 = t1; t1 = t2; t2 = 0;
3489   }
3490 
3491   while (t0)
3492     t0 = sub(m, n, t0, len);
3493 }
3494 
3495 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3496 // multiplies so it should be up to 25% faster than Montgomery
3497 // multiplication.  However, its loop control is more complex and it
3498 // may actually run slower on some machines.
3499 
3500 static void NOINLINE
3501 montgomery_square(julong a[], julong n[],
3502                   julong m[], julong inv, int len) {
3503   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3504   int i;
3505 
3506   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3507 
3508   for (i = 0; i < len; i++) {
3509     int j;
3510     int end = (i+1)/2;
3511     for (j = 0; j < end; j++) {
3512       MACC2(a[j], a[i-j], t0, t1, t2);
3513       MACC(m[j], n[i-j], t0, t1, t2);
3514     }
3515     if ((i & 1) == 0) {
3516       MACC(a[j], a[j], t0, t1, t2);
3517     }
3518     for (; j < i; j++) {
3519       MACC(m[j], n[i-j], t0, t1, t2);
3520     }
3521     m[i] = t0 * inv;
3522     MACC(m[i], n[0], t0, t1, t2);
3523 
3524     assert(t0 == 0, "broken Montgomery square");
3525 
3526     t0 = t1; t1 = t2; t2 = 0;
3527   }
3528 
3529   for (i = len; i < 2*len; i++) {
3530     int start = i-len+1;
3531     int end = start + (len - start)/2;
3532     int j;
3533     for (j = start; j < end; j++) {
3534       MACC2(a[j], a[i-j], t0, t1, t2);
3535       MACC(m[j], n[i-j], t0, t1, t2);
3536     }
3537     if ((i & 1) == 0) {
3538       MACC(a[j], a[j], t0, t1, t2);
3539     }
3540     for (; j < len; j++) {
3541       MACC(m[j], n[i-j], t0, t1, t2);
3542     }
3543     m[i-len] = t0;
3544     t0 = t1; t1 = t2; t2 = 0;
3545   }
3546 
3547   while (t0)
3548     t0 = sub(m, n, t0, len);
3549 }
3550 
3551 // Swap words in a longword.
3552 static julong swap(julong x) {
3553   return (x << 32) | (x >> 32);
3554 }
3555 
3556 // Copy len longwords from s to d, word-swapping as we go.  The
3557 // destination array is reversed.
3558 static void reverse_words(julong *s, julong *d, int len) {
3559   d += len;
3560   while(len-- > 0) {
3561     d--;
3562     *d = swap(*s);
3563     s++;
3564   }
3565 }
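
     // Illustrative note (a worked example, not part of the conversion): with
     // len == 2 and s[] == { 0x1111111122222222, 0x3333333344444444 },
     // reverse_words() produces d[] == { 0x4444444433333333, 0x2222222211111111 };
     // the 64-bit words are reversed and the two 32-bit halves of each word are
     // swapped, so a most-significant-first sequence of jints becomes a
     // least-significant-first array of julongs (and vice versa).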
3566 
3567 // The threshold at which squaring is advantageous was determined
3568 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3569 #define MONTGOMERY_SQUARING_THRESHOLD 64
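     // (64 jints correspond to a 2048-bit operand, so squaring is only attempted
     // at or above that size; see SharedRuntime::montgomery_square() below.)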
3570 
3571 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3572                                         jint len, jlong inv,
3573                                         jint *m_ints) {
3574   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3575   int longwords = len/2;
3576 
3577   // Make very sure we don't use so much space that the stack might
3578   // overflow.  512 jints corresponds to a 16384-bit integer and
3579   // will use a total of 8k bytes of stack space here.
3580   int divisor = sizeof(julong) * 4;
3581   guarantee(longwords <= 8192 / divisor, "must be");
3582   int total_allocation = longwords * sizeof (julong) * 4;
3583   julong *scratch = (julong *)alloca(total_allocation);
3584 
3585   // Local scratch arrays
3586   julong
3587     *a = scratch + 0 * longwords,
3588     *b = scratch + 1 * longwords,
3589     *n = scratch + 2 * longwords,
3590     *m = scratch + 3 * longwords;
3591 
3592   reverse_words((julong *)a_ints, a, longwords);
3593   reverse_words((julong *)b_ints, b, longwords);
3594   reverse_words((julong *)n_ints, n, longwords);
3595 
3596   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3597 
3598   reverse_words(m, (julong *)m_ints, longwords);
3599 }
3600 
3601 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3602                                       jint len, jlong inv,
3603                                       jint *m_ints) {
3604   assert(len % 2 == 0, "array length in montgomery_square must be even");
3605   int longwords = len/2;
3606 
3607   // Make very sure we don't use so much space that the stack might
3608   // overflow.  512 jints corresponds to a 16384-bit integer and
3609   // will use a total of 6k bytes of stack space here.
3610   int divisor = sizeof(julong) * 3;
3611   guarantee(longwords <= (8192 / divisor), "must be");
3612   int total_allocation = longwords * sizeof (julong) * 3;
3613   julong *scratch = (julong *)alloca(total_allocation);
3614 
3615   // Local scratch arrays
3616   julong
3617     *a = scratch + 0 * longwords,
3618     *n = scratch + 1 * longwords,
3619     *m = scratch + 2 * longwords;
3620 
3621   reverse_words((julong *)a_ints, a, longwords);
3622   reverse_words((julong *)n_ints, n, longwords);
3623 
3624   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3625     ::montgomery_square(a, n, m, (julong)inv, longwords);
3626   } else {
3627     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3628   }
3629 
3630   reverse_words(m, (julong *)m_ints, longwords);
3631 }
3632 
3633 #if INCLUDE_JFR
3634 
3635 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3636 // It returns a jobject handle to the event writer.
3637 // The handle is dereferenced and the return value is the event writer oop.
3638 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3639   enum layout {
3640     rbp_off,
3641     rbpH_off,
3642     return_off,
3643     return_off2,
3644     framesize // inclusive of return address
3645   };
3646 
3647   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3648   CodeBuffer code(name, 1024, 64);
3649   MacroAssembler* masm = new MacroAssembler(&code);
3650   address start = __ pc();
3651 
3652   __ enter();
3653   address the_pc = __ pc();
3654 
3655   int frame_complete = the_pc - start;
3656 
3657   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3658   __ movptr(c_rarg0, r15_thread);
3659   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3660   __ reset_last_Java_frame(true);
3661 
3662   // rax is jobject handle result, unpack and process it through a barrier.
3663   __ resolve_global_jobject(rax, c_rarg0);
3664 
3665   __ leave();
3666   __ ret(0);
3667 
3668   OopMapSet* oop_maps = new OopMapSet();
3669   OopMap* map = new OopMap(framesize, 1);
3670   oop_maps->add_gc_map(frame_complete, map);
3671 
3672   RuntimeStub* stub =
3673     RuntimeStub::new_runtime_stub(name,
3674                                   &code,
3675                                   frame_complete,
3676                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3677                                   oop_maps,
3678                                   false);
3679   return stub;
3680 }
3681 
3682 // For c2: call to return a leased buffer.
3683 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3684   enum layout {
3685     rbp_off,
3686     rbpH_off,
3687     return_off,
3688     return_off2,
3689     framesize // inclusive of return address
3690   };
3691 
3692   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3693   CodeBuffer code(name, 1024, 64);
3694   MacroAssembler* masm = new MacroAssembler(&code);
3695   address start = __ pc();
3696 
3697   __ enter();
3698   address the_pc = __ pc();
3699 
3700   int frame_complete = the_pc - start;
3701 
3702   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3703   __ movptr(c_rarg0, r15_thread);
3704   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3705   __ reset_last_Java_frame(true);
3706 
3707   __ leave();
3708   __ ret(0);
3709 
3710   OopMapSet* oop_maps = new OopMapSet();
3711   OopMap* map = new OopMap(framesize, 1);
3712   oop_maps->add_gc_map(frame_complete, map);
3713 
3714   RuntimeStub* stub =
3715     RuntimeStub::new_runtime_stub(name,
3716                                   &code,
3717                                   frame_complete,
3718                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3719                                   oop_maps,
3720                                   false);
3721   return stub;
3722 }
3723 
3724 #endif // INCLUDE_JFR
3725