   1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/nativeInst.hpp"
  33 #include "code/vtableStubs.hpp"
  34 #include "compiler/oopMap.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "gc/shared/gcLocker.hpp"
  37 #include "gc/shared/barrierSet.hpp"
  38 #include "gc/shared/barrierSetAssembler.hpp"
  39 #include "interpreter/interpreter.hpp"
  40 #include "logging/log.hpp"
  41 #include "memory/resourceArea.hpp"
  42 #include "memory/universe.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/method.inline.hpp"
  45 #include "prims/methodHandles.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/globals.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/timerTrace.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 #ifdef PRODUCT
  74 #define BLOCK_COMMENT(str) /* nothing */
  75 #else
  76 #define BLOCK_COMMENT(str) __ block_comment(str)
  77 #endif // PRODUCT
  78 
  79 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  80 
  81 class RegisterSaver {
  82   // Capture info about frame layout.  Layout offsets are in jint
  83   // units because compiler frame slots are jints.
  84 #define XSAVE_AREA_BEGIN 160
  85 #define XSAVE_AREA_YMM_BEGIN 576
  86 #define XSAVE_AREA_EGPRS 960
  87 #define XSAVE_AREA_OPMASK_BEGIN 1088
  88 #define XSAVE_AREA_ZMM_BEGIN 1152
  89 #define XSAVE_AREA_UPPERBANK 1664
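     // The constants above are byte offsets within the FPU/vector save area laid
     // down by push_FPU_state(): the legacy FXSAVE image keeps xmm0..xmm15 starting
     // at offset 160 (XSAVE_AREA_BEGIN), and the extended components (YMM upper
     // halves, APX extended GPRs, opmask registers, ZMM state) are stored at the
     // offsets that follow.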
  90 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  91 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  92 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  93 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  94 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
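       // Overall layout described by the enum below, in 4-byte slots from the final
       // rsp upward: the (possibly empty) argument register save area, the
       // FXSAVE/XSAVE state, the general purpose registers (r15 up through rax,
       // including an ignored extra copy of rbp), an alignment word, the flags,
       // a copy of rbp, and finally the return address at the top.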
  95   enum layout {
  96     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  97     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  98     DEF_XMM_OFFS(0),
  99     DEF_XMM_OFFS(1),
 100     // 2..15 are implied in range usage
 101     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 102     DEF_YMM_OFFS(0),
 103     DEF_YMM_OFFS(1),
 104     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 105     r16H_off,
 106     r17_off, r17H_off,
 107     r18_off, r18H_off,
 108     r19_off, r19H_off,
 109     r20_off, r20H_off,
 110     r21_off, r21H_off,
 111     r22_off, r22H_off,
 112     r23_off, r23H_off,
 113     r24_off, r24H_off,
 114     r25_off, r25H_off,
 115     r26_off, r26H_off,
 116     r27_off, r27H_off,
 117     r28_off, r28H_off,
 118     r29_off, r29H_off,
 119     r30_off, r30H_off,
 120     r31_off, r31H_off,
 121     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_OPMASK_OFFS(0),
 123     DEF_OPMASK_OFFS(1),
 124     // 2..7 are implied in range usage
 125     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_OFFS(0),
 127     DEF_ZMM_OFFS(1),
 128     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 129     DEF_ZMM_UPPER_OFFS(16),
 130     DEF_ZMM_UPPER_OFFS(17),
 131     // 18..31 are implied in range usage
 132     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 133     fpu_stateH_end,
 134     r15_off, r15H_off,
 135     r14_off, r14H_off,
 136     r13_off, r13H_off,
 137     r12_off, r12H_off,
 138     r11_off, r11H_off,
 139     r10_off, r10H_off,
 140     r9_off,  r9H_off,
 141     r8_off,  r8H_off,
 142     rdi_off, rdiH_off,
 143     rsi_off, rsiH_off,
 144     ignore_off, ignoreH_off,  // extra copy of rbp
 145     rsp_off, rspH_off,
 146     rbx_off, rbxH_off,
 147     rdx_off, rdxH_off,
 148     rcx_off, rcxH_off,
 149     rax_off, raxH_off,
 150     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 151     align_off, alignH_off,
 152     flags_off, flagsH_off,
 153     // The frame sender code expects that rbp will be in the "natural" place and
 154     // will override any oopMap setting for it. We must therefore force the layout
 155     // so that it agrees with the frame sender code.
 156     rbp_off, rbpH_off,        // copy of rbp we will restore
 157     return_off, returnH_off,  // slot for return address
 158     reg_save_size             // size in compiler stack slots
 159   };
 160 
 161  public:
 162   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 163   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 164 
 165   // Offsets into the register save area
 166   // Used by deoptimization when it is managing result register
 167   // values on its own
 168 
 169   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 170   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 171   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 172   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 173   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 174   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 175 
 176   // During deoptimization only the result registers need to be restored;
 177   // all the other values have already been extracted.
 178   static void restore_result_registers(MacroAssembler* masm);
 179 };
 180 
 181 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 182   int off = 0;
 183   int num_xmm_regs = XMMRegister::available_xmm_registers();
 184 #if COMPILER2_OR_JVMCI
 185   if (save_wide_vectors && UseAVX == 0) {
 186     save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
 187   }
 188   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 189 #else
 190   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 191 #endif
 192 
 193   // Always make the frame size 16-byte aligned; the full save area (vector and non-vector parts) is always allocated
 194   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 195   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 196   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 197   // CodeBlob frame size is in words.
 198   int frame_size_in_words = frame_size_in_bytes / wordSize;
 199   *total_frame_words = frame_size_in_words;
 200 
 201   // Save registers, fpu state, and flags.
 202   // We assume caller has already pushed the return address onto the
 203   // stack, so rsp is 8-byte aligned here.
 204   // We push rbp twice in this sequence because we want the real rbp
 205   // to be under the return address, just like a normal enter would place it.
 206 
 207   __ enter();          // rsp becomes 16-byte aligned here
 208   __ pushf();
 209   // Make sure rsp stays 16-byte aligned
 210   __ subq(rsp, 8);
 211   // Push CPU state in multiples of 16 bytes
 212   __ save_legacy_gprs();
 213   __ push_FPU_state();
 214 
 215 
 216   // push cpu state handles this on EVEX enabled targets
 217   if (save_wide_vectors) {
 218     // Save upper half of YMM registers(0..15)
 219     int base_addr = XSAVE_AREA_YMM_BEGIN;
 220     for (int n = 0; n < 16; n++) {
 221       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 222     }
 223     if (VM_Version::supports_evex()) {
 224       // Save upper half of ZMM registers(0..15)
 225       base_addr = XSAVE_AREA_ZMM_BEGIN;
 226       for (int n = 0; n < 16; n++) {
 227         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 228       }
 229       // Save full ZMM registers(16..num_xmm_regs)
 230       base_addr = XSAVE_AREA_UPPERBANK;
 231       off = 0;
 232       int vector_len = Assembler::AVX_512bit;
 233       for (int n = 16; n < num_xmm_regs; n++) {
 234         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 235       }
 236 #if COMPILER2_OR_JVMCI
 237       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 238       off = 0;
 239       for(int n = 0; n < KRegister::number_of_registers; n++) {
 240         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 241       }
 242 #endif
 243     }
 244   } else {
 245     if (VM_Version::supports_evex()) {
 246       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 247       int base_addr = XSAVE_AREA_UPPERBANK;
 248       off = 0;
 249       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 250       for (int n = 16; n < num_xmm_regs; n++) {
 251         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 252       }
 253 #if COMPILER2_OR_JVMCI
 254       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 255       off = 0;
 256       for(int n = 0; n < KRegister::number_of_registers; n++) {
 257         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 258       }
 259 #endif
 260     }
 261   }
 262 
 263 #if COMPILER2_OR_JVMCI
 264   if (UseAPX) {
 265       int base_addr = XSAVE_AREA_EGPRS;
 266       off = 0;
 267       for (int n = 16; n < Register::number_of_registers; n++) {
 268         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 269       }
 270   }
 271 #endif
 272 
 273   __ vzeroupper();
 274   if (frame::arg_reg_save_area_bytes != 0) {
 275     // Allocate argument register save area
 276     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 277   }
 278 
 279   // Set an oopmap for the call site.  This oopmap will map all
 280   // oop-registers and debug-info registers as callee-saved.  This
 281   // will allow deoptimization at this safepoint to find all possible
 282   // debug-info recordings, as well as let GC find all oops.
 283 
 284   OopMapSet *oop_maps = new OopMapSet();
 285   OopMap* map = new OopMap(frame_size_in_slots, 0);
 286 
 287 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 288 
 289   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 290   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 293   // rbp location is known implicitly by the frame sender code, needs no oopmap
 294   // and the location where rbp was saved is ignored
 295   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 296   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 305 
 306   if (UseAPX) {
 307     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 308     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 323   }
 324   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 325   // on EVEX enabled targets it is also included in the XSAVE area.
 326   off = xmm0_off;
 327   int delta = xmm1_off - off;
 328   for (int n = 0; n < 16; n++) {
 329     XMMRegister xmm_name = as_XMMRegister(n);
 330     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 331     off += delta;
 332   }
 333   if (UseAVX > 2) {
 334     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 335     off = zmm16_off;
 336     delta = zmm17_off - off;
 337     for (int n = 16; n < num_xmm_regs; n++) {
 338       XMMRegister zmm_name = as_XMMRegister(n);
 339       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 340       off += delta;
 341     }
 342   }
 343 
 344 #if COMPILER2_OR_JVMCI
 345   if (save_wide_vectors) {
 346     // Save upper half of YMM registers(0..15)
 347     off = ymm0_off;
 348     delta = ymm1_off - ymm0_off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister ymm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 352       off += delta;
 353     }
 354     if (VM_Version::supports_evex()) {
 355       // Save upper half of ZMM registers(0..15)
 356       off = zmm0_off;
 357       delta = zmm1_off - zmm0_off;
 358       for (int n = 0; n < 16; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 361         off += delta;
 362       }
 363     }
 364   }
 365 #endif // COMPILER2_OR_JVMCI
 366 
 367   // %%% These should all be a waste but we'll keep things as they were for now
 368   if (true) {
 369     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 370     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 373     // rbp location is known implicitly by the frame sender code, needs no oopmap
 374     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 375     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 384     if (UseAPX) {
 385       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 386       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 401     }
 402     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 403     // on EVEX enabled targets it is also included in the XSAVE area.
 404     off = xmm0H_off;
 405     delta = xmm1H_off - off;
 406     for (int n = 0; n < 16; n++) {
 407       XMMRegister xmm_name = as_XMMRegister(n);
 408       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 409       off += delta;
 410     }
 411     if (UseAVX > 2) {
 412       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 413       off = zmm16H_off;
 414       delta = zmm17H_off - off;
 415       for (int n = 16; n < num_xmm_regs; n++) {
 416         XMMRegister zmm_name = as_XMMRegister(n);
 417         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 418         off += delta;
 419       }
 420     }
 421   }
 422 
 423   return map;
 424 }
 425 
 426 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 427   int num_xmm_regs = XMMRegister::available_xmm_registers();
 428   if (frame::arg_reg_save_area_bytes != 0) {
 429     // Pop arg register save area
 430     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 431   }
 432 
 433 #if COMPILER2_OR_JVMCI
 434   if (restore_wide_vectors) {
 435     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 436     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 437   }
 438 #else
 439   assert(!restore_wide_vectors, "vectors are generated only by C2");
 440 #endif
 441 
 442   __ vzeroupper();
 443 
 444   // On EVEX enabled targets everything is handled in pop fpu state
 445   if (restore_wide_vectors) {
 446     // Restore upper half of YMM registers (0..15)
 447     int base_addr = XSAVE_AREA_YMM_BEGIN;
 448     for (int n = 0; n < 16; n++) {
 449       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 450     }
 451     if (VM_Version::supports_evex()) {
 452       // Restore upper half of ZMM registers (0..15)
 453       base_addr = XSAVE_AREA_ZMM_BEGIN;
 454       for (int n = 0; n < 16; n++) {
 455         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 456       }
 457       // Restore full ZMM registers(16..num_xmm_regs)
 458       base_addr = XSAVE_AREA_UPPERBANK;
 459       int vector_len = Assembler::AVX_512bit;
 460       int off = 0;
 461       for (int n = 16; n < num_xmm_regs; n++) {
 462         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 463       }
 464 #if COMPILER2_OR_JVMCI
 465       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 466       off = 0;
 467       for (int n = 0; n < KRegister::number_of_registers; n++) {
 468         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 469       }
 470 #endif
 471     }
 472   } else {
 473     if (VM_Version::supports_evex()) {
 474       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 475       int base_addr = XSAVE_AREA_UPPERBANK;
 476       int off = 0;
 477       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 478       for (int n = 16; n < num_xmm_regs; n++) {
 479         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 480       }
 481 #if COMPILER2_OR_JVMCI
 482       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 483       off = 0;
 484       for (int n = 0; n < KRegister::number_of_registers; n++) {
 485         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 486       }
 487 #endif
 488     }
 489   }
 490 
 491 #if COMPILER2_OR_JVMCI
 492   if (UseAPX) {
 493     int base_addr = XSAVE_AREA_EGPRS;
 494     int off = 0;
 495     for (int n = 16; n < Register::number_of_registers; n++) {
 496       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 497     }
 498   }
 499 #endif
 500 
 501   // Recover CPU state
 502   __ pop_FPU_state();
 503   __ restore_legacy_gprs();
 504   __ addq(rsp, 8);
 505   __ popf();
 506   // Get the rbp described implicitly by the calling convention (no oopMap)
 507   __ pop(rbp);
 508 }
 509 
 510 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 511 
 512   // Just restore the result registers. Only used by deoptimization. By
 513   // now any callee-saved register that needs to be restored to a c2
 514   // caller of the deoptee has been extracted into the vframeArray
 515   // and will be stuffed into the c2i adapter we create for later
 516   // restoration, so only the result registers need to be restored here.
 517 
 518   // Restore fp result register
 519   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 520   // Restore integer result register
 521   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 522   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 523 
 524   // Pop all of the register save area off the stack except the return address
 525   __ addptr(rsp, return_offset_in_bytes());
 526 }
 527 
 528 // Is the vector's size (in bytes) bigger than the size saved by default?
 529 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
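     // That is, 32-byte (YMM) and 64-byte (ZMM) vectors need the extra save/restore
     // handling in RegisterSaver above.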
 530 bool SharedRuntime::is_wide_vector(int size) {
 531   return size > 16;
 532 }
 533 
 534 // ---------------------------------------------------------------------------
 535 // Read the array of BasicTypes from a signature, and compute where the
 536 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 537 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 538 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 539 // as framesizes are fixed.
 540 // VMRegImpl::stack0 refers to the first slot, 0(sp),
 541 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 542 // Registers up to Register::number_of_registers are the 64-bit
 543 // integer registers.
 544 
 545 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 546 // 64-bit on this (64-bit only) platform.  The OUTPUTS are in 32-bit units
 547 // regardless of build.
 548 
 549 // The Java calling convention is a "shifted" version of the C ABI.
 550 // By skipping the first C ABI register we can call non-static JNI methods
 551 // with small numbers of arguments without having to shuffle the arguments
 552 // at all. Since we control the Java ABI we ought to at least get some
 553 // advantage out of it.
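     //
     // For illustration only (not generated code): with the j_rarg* / j_farg*
     // registers defined for this platform, a signature such as
     //   (int, long, Object, float, double)
     // i.e. sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT, T_DOUBLE, T_VOID }
     // is assigned roughly as follows:
     //   T_INT    -> j_rarg0     T_LONG   -> j_rarg1 (its T_VOID half is skipped)
     //   T_OBJECT -> j_rarg2     T_FLOAT  -> j_farg0
     //   T_DOUBLE -> j_farg1 (its T_VOID half is skipped)
     // Arguments that overflow the registers land in 4-byte stack slots; the
     // return value is the number of such slots used.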
 554 
 555 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 556                                            VMRegPair *regs,
 557                                            int total_args_passed) {
 558 
 559   // Create the mapping between argument positions and
 560   // registers.
 561   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 562     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 563   };
 564   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 565     j_farg0, j_farg1, j_farg2, j_farg3,
 566     j_farg4, j_farg5, j_farg6, j_farg7
 567   };
 568 
 569 
 570   uint int_args = 0;
 571   uint fp_args = 0;
 572   uint stk_args = 0;
 573 
 574   for (int i = 0; i < total_args_passed; i++) {
 575     switch (sig_bt[i]) {
 576     case T_BOOLEAN:
 577     case T_CHAR:
 578     case T_BYTE:
 579     case T_SHORT:
 580     case T_INT:
 581       if (int_args < Argument::n_int_register_parameters_j) {
 582         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 583       } else {
 584         stk_args = align_up(stk_args, 2);
 585         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 586         stk_args += 1;
 587       }
 588       break;
 589     case T_VOID:
 590       // halves of T_LONG or T_DOUBLE
 591       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 592       regs[i].set_bad();
 593       break;
 594     case T_LONG:
 595       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 596       // fall through
 597     case T_OBJECT:
 598     case T_ARRAY:
 599     case T_ADDRESS:
 600       if (int_args < Argument::n_int_register_parameters_j) {
 601         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 602       } else {
 603         stk_args = align_up(stk_args, 2);
 604         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 605         stk_args += 2;
 606       }
 607       break;
 608     case T_FLOAT:
 609       if (fp_args < Argument::n_float_register_parameters_j) {
 610         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 611       } else {
 612         stk_args = align_up(stk_args, 2);
 613         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 614         stk_args += 1;
 615       }
 616       break;
 617     case T_DOUBLE:
 618       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 621       } else {
 622         stk_args = align_up(stk_args, 2);
 623         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 624         stk_args += 2;
 625       }
 626       break;
 627     default:
 628       ShouldNotReachHere();
 629       break;
 630     }
 631   }
 632 
 633   return stk_args;
 634 }
 635 
 636 // Patch the caller's callsite with the entry to compiled code if it exists.
 637 static void patch_callers_callsite(MacroAssembler *masm) {
 638   Label L;
 639   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 640   __ jcc(Assembler::equal, L);
 641 
 642   // Save the current stack pointer
 643   __ mov(r13, rsp);
 644   // Schedule the branch target address early.
 645   // Call into the VM to patch the caller, then jump to compiled callee
 646   // rax isn't live so capture return address while we easily can
 647   __ movptr(rax, Address(rsp, 0));
 648 
 649   // align stack so push_CPU_state doesn't fault
 650   __ andptr(rsp, -(StackAlignmentInBytes));
 651   __ push_CPU_state();
 652   __ vzeroupper();
 653   // VM needs caller's callsite
 654   // VM needs target method
 655   // This needs to be a long call since we will relocate this adapter to
 656   // the codeBuffer and it may not reach
 657 
 658   // Allocate argument register save area
 659   if (frame::arg_reg_save_area_bytes != 0) {
 660     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 661   }
 662   __ mov(c_rarg0, rbx);
 663   __ mov(c_rarg1, rax);
 664   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 665 
 666   // De-allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670 
 671   __ vzeroupper();
 672   __ pop_CPU_state();
 673   // restore sp
 674   __ mov(rsp, r13);
 675   __ bind(L);
 676 }
 677 
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
 749     // However, to make things extra confusing: because we can fit a long/double in
 750     // a single slot on a 64-bit VM and it would be silly to break them up, the
 751     // interpreter leaves one slot empty and only stores to a single slot. In this case
 752     // the slot that is occupied is the T_VOID slot. See, I said it was confusing.
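         // For the example above: the 64-bit T_LONG value at i == 0 is stored at
         // next_off == 24 (the slot belonging to the T_VOID at i == 1), while its
         // own st_off == 32 slot is left unused (and is filled with recognizable
         // junk in debug builds).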
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
 761       // memory to memory use rax
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
 772         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 773         // T_DOUBLE and T_LONG use two slots in the interpreter
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
 790         // must be only an int (or smaller) so move only 32 bits to the slot
 791         // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
 794         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 795         // T_DOUBLE and T_LONG use two slots in the interpreter
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
 811         // only a float, use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
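     // Branches to L_ok if pc_reg lies strictly within (code_start, code_end);
     // otherwise control falls through (the caller follows its range checks with
     // a stop()).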
 829 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 830                         address code_start, address code_end,
 831                         Label& L_ok) {
 832   Label L_fail;
 833   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
 834   __ cmpptr(pc_reg, temp_reg);
 835   __ jcc(Assembler::belowEqual, L_fail);
 836   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
 837   __ cmpptr(pc_reg, temp_reg);
 838   __ jcc(Assembler::below, L_ok);
 839   __ bind(L_fail);
 840 }
 841 
 842 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 843                                     int total_args_passed,
 844                                     int comp_args_on_stack,
 845                                     const BasicType *sig_bt,
 846                                     const VMRegPair *regs) {
 847 
 848   // Note: r13 contains the senderSP on entry. We must preserve it since
 849   // we may do an i2c -> c2i transition if we lose a race where compiled
 850   // code goes non-entrant while we get args ready.
 851   // In addition we use r13 to locate all the interpreter args, because
 852   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 853   // lose the alignment we expect in all compiled code, and the register
 854   // save code can segv when fxsave instructions find an improperly
 855   // aligned stack pointer.
 856 
 857   // Adapters can be frameless because they do not require the caller
 858   // to perform additional cleanup work, such as correcting the stack pointer.
 859   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 860   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 861   // even if a callee has modified the stack pointer.
 862   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 863   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 864   // up via the senderSP register).
 865   // In other words, if *either* the caller or callee is interpreted, we can
 866   // get the stack pointer repaired after a call.
 867   // This is why c2i and i2c adapters cannot be indefinitely composed.
 868   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 869   // both caller and callee would be compiled methods, and neither would
 870   // clean up the stack pointer changes performed by the two adapters.
 871   // If this happens, control eventually transfers back to the compiled
 872   // caller, but with an uncorrected stack, causing delayed havoc.
 873 
 874   if (VerifyAdapterCalls &&
 875       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 876     // So, let's test for cascading c2i/i2c adapters right now.
 877     //  assert(Interpreter::contains($return_addr) ||
 878     //         StubRoutines::contains($return_addr),
 879     //         "i2c adapter must return to an interpreter frame");
 880     __ block_comment("verify_i2c { ");
 881     // Pick up the return address
 882     __ movptr(rax, Address(rsp, 0));
 883     Label L_ok;
 884     if (Interpreter::code() != nullptr) {
 885       range_check(masm, rax, r11,
 886                   Interpreter::code()->code_start(),
 887                   Interpreter::code()->code_end(),
 888                   L_ok);
 889     }
 890     if (StubRoutines::initial_stubs_code() != nullptr) {
 891       range_check(masm, rax, r11,
 892                   StubRoutines::initial_stubs_code()->code_begin(),
 893                   StubRoutines::initial_stubs_code()->code_end(),
 894                   L_ok);
 895     }
 896     if (StubRoutines::final_stubs_code() != nullptr) {
 897       range_check(masm, rax, r11,
 898                   StubRoutines::final_stubs_code()->code_begin(),
 899                   StubRoutines::final_stubs_code()->code_end(),
 900                   L_ok);
 901     }
 902     const char* msg = "i2c adapter must return to an interpreter frame";
 903     __ block_comment(msg);
 904     __ stop(msg);
 905     __ bind(L_ok);
 906     __ block_comment("} verify_i2c ");
 907   }
 908 
 909   // Must preserve original SP for loading incoming arguments because
 910   // we need to align the outgoing SP for compiled code.
 911   __ movptr(r11, rsp);
 912 
 913   // Pick up the return address
 914   __ pop(rax);
 915 
 916   // Convert 4-byte c2 stack slots to words.
 917   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
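       // For example, comp_args_on_stack == 5 slots -> 20 bytes, rounded up to
       // 24 bytes -> comp_words_on_stack == 3.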
 918 
 919   if (comp_args_on_stack) {
 920     __ subptr(rsp, comp_words_on_stack * wordSize);
 921   }
 922 
 923   // Ensure compiled code always sees stack at proper alignment
 924   __ andptr(rsp, -16);
 925 
 926   // Push the return address and misalign the stack so that the youngest frame
 927   // sees the stack layout it would have right after a call instruction.
 928   __ push(rax);
 929 
 930   // Put saved SP in another register
 931   const Register saved_sp = rax;
 932   __ movptr(saved_sp, r11);
 933 
 934   // Will jump to the compiled code just as if compiled code was doing it.
 935   // Pre-load the register-jump target early, to schedule it better.
 936   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 937 
 938 #if INCLUDE_JVMCI
 939   if (EnableJVMCI) {
 940     // check if this call should be routed towards a specific entry point
 941     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 942     Label no_alternative_target;
 943     __ jcc(Assembler::equal, no_alternative_target);
 944     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 945     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 946     __ bind(no_alternative_target);
 947   }
 948 #endif // INCLUDE_JVMCI
 949 
 950   // Now generate the shuffle code.  Pick up all register args and move the
 951   // rest through the floating point stack top.
 952   for (int i = 0; i < total_args_passed; i++) {
 953     if (sig_bt[i] == T_VOID) {
 954       // Longs and doubles are passed in native word order, but misaligned
 955       // in the 32-bit build.
 956       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 957       continue;
 958     }
 959 
 960     // Pick up 0, 1 or 2 words from SP+offset.
 961 
 962     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 963             "scrambled load targets?");
 964     // Load in argument order going down.
 965     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 966     // Point to interpreter value (vs. tag)
 967     int next_off = ld_off - Interpreter::stackElementSize;
 968     //
 969     //
 970     //
 971     VMReg r_1 = regs[i].first();
 972     VMReg r_2 = regs[i].second();
 973     if (!r_1->is_valid()) {
 974       assert(!r_2->is_valid(), "");
 975       continue;
 976     }
 977     if (r_1->is_stack()) {
 978       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 979       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 980 
 981       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
 982       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 983       // will be generated.
 984       if (!r_2->is_valid()) {
 985         // sign extend???
 986         __ movl(r13, Address(saved_sp, ld_off));
 987         __ movptr(Address(rsp, st_off), r13);
 988       } else {
 989         //
 990         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 991         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 992         // so we must adjust where to pick up the data to match the interpreter.
 993         //
 994         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 995         // are accessed at negative offsets, so the LSW is at the LOWER address.
 996 
 997         // ld_off is MSW so get LSW
 998         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 999                            next_off : ld_off;
1000         __ movq(r13, Address(saved_sp, offset));
1001         // st_off is LSW (i.e. reg.first())
1002         __ movq(Address(rsp, st_off), r13);
1003       }
1004     } else if (r_1->is_Register()) {  // Register argument
1005       Register r = r_1->as_Register();
1006       assert(r != rax, "must be different");
1007       if (r_2->is_valid()) {
1008         //
1009         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
1010         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1011         // so we must adjust where to pick up the data to match the interpreter.
1012 
1013         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1014                            next_off : ld_off;
1015 
1016         // this can be a misaligned move
1017         __ movq(r, Address(saved_sp, offset));
1018       } else {
1019         // sign extend and use a full word?
1020         __ movl(r, Address(saved_sp, ld_off));
1021       }
1022     } else {
1023       if (!r_2->is_valid()) {
1024         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1025       } else {
1026         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1027       }
1028     }
1029   }
1030 
1031   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1032 
1033   // 6243940 We might end up in handle_wrong_method if
1034   // the callee is deoptimized as we race thru here. If that
1035   // happens we don't want to take a safepoint because the
1036   // caller frame will look interpreted and arguments are now
1037   // "compiled" so it is much better to make this transition
1038   // invisible to the stack walking code. Unfortunately if
1039   // we try and find the callee by normal means a safepoint
1040   // is possible. So we stash the desired callee in the thread
1041   // and the VM will find it there should this case occur.
1042 
1043   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1044 
1045   // put Method* where a c2i would expect it should we end up there;
1046   // only needed because c2 resolve stubs return Method* as a result in
1047   // rax
1048   __ mov(rax, rbx);
1049   __ jmp(r11);
1050 }
1051 
1052 // ---------------------------------------------------------------
1053 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1054                                                             int total_args_passed,
1055                                                             int comp_args_on_stack,
1056                                                             const BasicType *sig_bt,
1057                                                             const VMRegPair *regs,
1058                                                             AdapterFingerPrint* fingerprint) {
1059   address i2c_entry = __ pc();
1060 
1061   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1062 
1063   // -------------------------------------------------------------------------
1064   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1065   // to the interpreter.  The args start out packed in the compiled layout.  They
1066   // need to be unpacked into the interpreter layout.  This will almost always
1067   // require some stack space.  We grow the current (compiled) stack, then repack
1068   // the args.  We  finally end in a jump to the generic interpreter entry point.
1069   // On exit from the interpreter, the interpreter will restore our SP (lest the
1070   // compiled code, which relies solely on SP and not RBP, get sick).
1071 
1072   address c2i_unverified_entry = __ pc();
1073   Label skip_fixup;
1074 
1075   Register data = rax;
1076   Register receiver = j_rarg0;
1077   Register temp = rbx;
1078 
1079   {
1080     __ ic_check(1 /* end_alignment */);
1081     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1082     // Method might have been compiled since the call site was patched to
1083     // interpreted; if that is the case, treat it as a miss so we can get
1084     // the call site corrected.
1085     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1086     __ jcc(Assembler::equal, skip_fixup);
1087     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1088   }
1089 
1090   address c2i_entry = __ pc();
1091 
1092   // Class initialization barrier for static methods
1093   address c2i_no_clinit_check_entry = nullptr;
1094   if (VM_Version::supports_fast_class_init_checks()) {
1095     Label L_skip_barrier;
1096     Register method = rbx;
1097 
1098     { // Bypass the barrier for non-static methods
1099       Register flags = rscratch1;
1100       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1101       __ testl(flags, JVM_ACC_STATIC);
1102       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1103     }
1104 
1105     Register klass = rscratch1;
1106     __ load_method_holder(klass, method);
1107     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1108 
1109     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1110 
1111     __ bind(L_skip_barrier);
1112     c2i_no_clinit_check_entry = __ pc();
1113   }
1114 
1115   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1116   bs->c2i_entry_barrier(masm);
1117 
1118   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1119 
1120   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1121 }
1122 
1123 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1124                                          VMRegPair *regs,
1125                                          int total_args_passed) {
1126 
1127 // We return the number of VMRegImpl stack slots we need to reserve for all
1128 // the arguments NOT counting out_preserve_stack_slots.
1129 
1130 // NOTE: These arrays will have to change when c1 is ported
1131 #ifdef _WIN64
1132     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1133       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1134     };
1135     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1136       c_farg0, c_farg1, c_farg2, c_farg3
1137     };
1138 #else
1139     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1140       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1141     };
1142     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1143       c_farg0, c_farg1, c_farg2, c_farg3,
1144       c_farg4, c_farg5, c_farg6, c_farg7
1145     };
1146 #endif // _WIN64
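         // For reference: on Windows, c_rarg0..c_rarg3 are rcx, rdx, r8, r9 and
         // c_farg0..c_farg3 are xmm0..xmm3; on System V targets, c_rarg0..c_rarg5
         // are rdi, rsi, rdx, rcx, r8, r9 and c_farg0..c_farg7 are xmm0..xmm7.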
1147 
1148 
1149     uint int_args = 0;
1150     uint fp_args = 0;
1151     uint stk_args = 0; // inc by 2 each time
1152 
1153     for (int i = 0; i < total_args_passed; i++) {
1154       switch (sig_bt[i]) {
1155       case T_BOOLEAN:
1156       case T_CHAR:
1157       case T_BYTE:
1158       case T_SHORT:
1159       case T_INT:
1160         if (int_args < Argument::n_int_register_parameters_c) {
1161           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1162 #ifdef _WIN64
1163           fp_args++;
1164           // Allocate slots for the callee to stuff register args on the stack.
1165           stk_args += 2;
1166 #endif
1167         } else {
1168           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1169           stk_args += 2;
1170         }
1171         break;
1172       case T_LONG:
1173         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1174         // fall through
1175       case T_OBJECT:
1176       case T_ARRAY:
1177       case T_ADDRESS:
1178       case T_METADATA:
1179         if (int_args < Argument::n_int_register_parameters_c) {
1180           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1181 #ifdef _WIN64
1182           fp_args++;
1183           stk_args += 2;
1184 #endif
1185         } else {
1186           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1187           stk_args += 2;
1188         }
1189         break;
1190       case T_FLOAT:
1191         if (fp_args < Argument::n_float_register_parameters_c) {
1192           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1193 #ifdef _WIN64
1194           int_args++;
1195           // Allocate slots for the callee to stuff register args on the stack.
1196           stk_args += 2;
1197 #endif
1198         } else {
1199           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1200           stk_args += 2;
1201         }
1202         break;
1203       case T_DOUBLE:
1204         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1205         if (fp_args < Argument::n_float_register_parameters_c) {
1206           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1207 #ifdef _WIN64
1208           int_args++;
1209           // Allocate slots for the callee to stuff register args on the stack.
1210           stk_args += 2;
1211 #endif
1212         } else {
1213           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1214           stk_args += 2;
1215         }
1216         break;
1217       case T_VOID: // Halves of longs and doubles
1218         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1219         regs[i].set_bad();
1220         break;
1221       default:
1222         ShouldNotReachHere();
1223         break;
1224       }
1225     }
1226 #ifdef _WIN64
1227   // The Windows ABI requires that we always allocate enough stack space
1228   // for four 64-bit registers to be stored down.
1229   if (stk_args < 8) {
1230     stk_args = 8;
1231   }
1232 #endif // _WIN64
1233 
1234   return stk_args;
1235 }
1236 
1237 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1238                                              uint num_bits,
1239                                              uint total_args_passed) {
1240   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1241          "only certain vector sizes are supported for now");
1242 
1243   static const XMMRegister VEC_ArgReg[32] = {
1244      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1245      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1246     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1247     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1248   };
1249 
1250   uint stk_args = 0;
1251   uint fp_args = 0;
1252 
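       // Each vector argument is passed entirely in one XMM/YMM/ZMM register;
       // next_val below is the number of additional 32-bit VMReg slots the value
       // spans (num_bits/32 - 1), so no stack slots are ever needed here.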
1253   for (uint i = 0; i < total_args_passed; i++) {
1254     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1255     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1256     regs[i].set_pair(vmreg->next(next_val), vmreg);
1257   }
1258 
1259   return stk_args;
1260 }
1261 
1262 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1263   // We always ignore the frame_slots arg and just use the space just below the
1264   // frame pointer, which by this time is free to use
1265   switch (ret_type) {
1266   case T_FLOAT:
1267     __ movflt(Address(rbp, -wordSize), xmm0);
1268     break;
1269   case T_DOUBLE:
1270     __ movdbl(Address(rbp, -wordSize), xmm0);
1271     break;
1272   case T_VOID:  break;
1273   default: {
1274     __ movptr(Address(rbp, -wordSize), rax);
1275     }
1276   }
1277 }
1278 
1279 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1280   // We always ignore the frame_slots arg and just use the space just below the
1281   // frame pointer, which by this time is free to use
1282   switch (ret_type) {
1283   case T_FLOAT:
1284     __ movflt(xmm0, Address(rbp, -wordSize));
1285     break;
1286   case T_DOUBLE:
1287     __ movdbl(xmm0, Address(rbp, -wordSize));
1288     break;
1289   case T_VOID:  break;
1290   default: {
1291     __ movptr(rax, Address(rbp, -wordSize));
1292     }
1293   }
1294 }
1295 
1296 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1297     for ( int i = first_arg ; i < arg_count ; i++ ) {
1298       if (args[i].first()->is_Register()) {
1299         __ push(args[i].first()->as_Register());
1300       } else if (args[i].first()->is_XMMRegister()) {
1301         __ subptr(rsp, 2*wordSize);
1302         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1303       }
1304     }
1305 }
1306 
1307 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1308     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1309       if (args[i].first()->is_Register()) {
1310         __ pop(args[i].first()->as_Register());
1311       } else if (args[i].first()->is_XMMRegister()) {
1312         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1313         __ addptr(rsp, 2*wordSize);
1314       }
1315     }
1316 }
1317 
1318 static void verify_oop_args(MacroAssembler* masm,
1319                             const methodHandle& method,
1320                             const BasicType* sig_bt,
1321                             const VMRegPair* regs) {
1322   Register temp_reg = rbx;  // not part of any compiled calling seq
1323   if (VerifyOops) {
1324     for (int i = 0; i < method->size_of_parameters(); i++) {
1325       if (is_reference_type(sig_bt[i])) {
1326         VMReg r = regs[i].first();
1327         assert(r->is_valid(), "bad oop arg");
1328         if (r->is_stack()) {
1329           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1330           __ verify_oop(temp_reg);
1331         } else {
1332           __ verify_oop(r->as_Register());
1333         }
1334       }
1335     }
1336   }
1337 }
1338 
1339 static void check_continuation_enter_argument(VMReg actual_vmreg,
1340                                               Register expected_reg,
1341                                               const char* name) {
1342   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1343   assert(actual_vmreg->as_Register() == expected_reg,
1344          "%s is in unexpected register: %s instead of %s",
1345          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1346 }
1347 
1348 
1349 //---------------------------- continuation_enter_setup ---------------------------
1350 //
1351 // Arguments:
1352 //   None.
1353 //
1354 // Results:
1355 //   rsp: pointer to blank ContinuationEntry
1356 //
1357 // Kills:
1358 //   rax
1359 //
1360 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1361   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1362   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1363   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1364 
1365   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1366   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1367 
1368   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1369   OopMap* map = new OopMap(frame_size, 0);
1370 
1371   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1372   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1373   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1374 
1375   return map;
1376 }
1377 
1378 //---------------------------- fill_continuation_entry ---------------------------
1379 //
1380 // Arguments:
1381 //   rsp: pointer to blank Continuation entry
1382 //   reg_cont_obj: pointer to the continuation
1383 //   reg_flags: flags
1384 //
1385 // Results:
1386 //   rsp: pointer to filled out ContinuationEntry
1387 //
1388 // Kills:
1389 //   rax
1390 //
1391 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1392   assert_different_registers(rax, reg_cont_obj, reg_flags);
1393 #ifdef ASSERT
1394   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1395 #endif
1396   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1397   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1398   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1399   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1400   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1401 
1402   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1403   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1404   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1405   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1406 
1407   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1408   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1409 }
1410 
1411 //---------------------------- continuation_enter_cleanup ---------------------------
1412 //
1413 // Arguments:
1414 //   rsp: pointer to the ContinuationEntry
1415 //
1416 // Results:
1417 //   rsp: pointer to the spilled rbp in the entry frame
1418 //
1419 // Kills:
1420 //   rbx
1421 //
1422 static void continuation_enter_cleanup(MacroAssembler* masm) {
1423 #ifdef ASSERT
1424   Label L_good_sp;
1425   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1426   __ jcc(Assembler::equal, L_good_sp);
1427   __ stop("Incorrect rsp at continuation_enter_cleanup");
1428   __ bind(L_good_sp);
1429 #endif
1430   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1431   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1432 
1433   if (CheckJNICalls) {
1434     // Check if this is a virtual thread continuation
1435     Label L_skip_vthread_code;
1436     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1437     __ jcc(Assembler::equal, L_skip_vthread_code);
1438 
1439     // If the held monitor count is > 0 and this vthread is terminating then
1440     // it failed to release a JNI monitor. So we issue the same log message
1441     // that JavaThread::exit does.
1442     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1443     __ jcc(Assembler::equal, L_skip_vthread_code);
1444 
1445     // rax may hold an exception oop, save it before the call
1446     __ push(rax);
1447     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1448     __ pop(rax);
1449 
1450     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1451     // on termination. The held count is implicitly zeroed below when we restore from
1452     // the parent held count (which has to be zero).
1453     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1454 
1455     __ bind(L_skip_vthread_code);
1456   }
1457 #ifdef ASSERT
1458   else {
1459     // Check if this is a virtual thread continuation
1460     Label L_skip_vthread_code;
1461     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1462     __ jcc(Assembler::equal, L_skip_vthread_code);
1463 
1464     // See comment just above. If not checking JNI calls the JNI count is only
1465     // needed for assertion checking.
1466     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1467 
1468     __ bind(L_skip_vthread_code);
1469   }
1470 #endif
1471 
1472   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1473   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1474 
1475   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1476   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1477   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1478 }
1479 
1480 static void gen_continuation_enter(MacroAssembler* masm,
1481                                    const VMRegPair* regs,
1482                                    int& exception_offset,
1483                                    OopMapSet* oop_maps,
1484                                    int& frame_complete,
1485                                    int& stack_slots,
1486                                    int& interpreted_entry_offset,
1487                                    int& compiled_entry_offset) {
1488 
1489   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1490   int pos_cont_obj   = 0;
1491   int pos_is_cont    = 1;
1492   int pos_is_virtual = 2;
1493 
1494   // The platform-specific calling convention may present the arguments in various registers.
1495   // To simplify the rest of the code, we expect the arguments to reside in these known
1496   // registers, and we additionally check the placement here in case the calling
1497   // convention ever changes.
1498   Register reg_cont_obj   = c_rarg1;
1499   Register reg_is_cont    = c_rarg2;
1500   Register reg_is_virtual = c_rarg3;
1501 
1502   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1503   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1504   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1505 
1506   // Utility methods kill rax; make sure there are no collisions.
1507   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1508 
1509   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1510                          relocInfo::static_call_type);
1511 
1512   address start = __ pc();
1513 
1514   Label L_thaw, L_exit;
1515 
1516   // i2i entry used at interp_only_mode only
1517   interpreted_entry_offset = __ pc() - start;
1518   {
1519 #ifdef ASSERT
1520     Label is_interp_only;
1521     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1522     __ jcc(Assembler::notEqual, is_interp_only);
1523     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1524     __ bind(is_interp_only);
1525 #endif
1526 
1527     __ pop(rax); // return address
1528     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1529     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1530     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1531     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
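         // Interpreter arguments are laid out with the first argument in the highest
         // stack-element slot, which is why the Continuation oop comes from slot 2 and
         // the two boolean flags from slots 1 and 0.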
1532     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1533     __ push(rax); // return address
1534     __ push_cont_fastpath();
1535 
1536     __ enter();
1537 
1538     stack_slots = 2; // will be adjusted in setup
1539     OopMap* map = continuation_enter_setup(masm, stack_slots);
1540     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1541     // That is okay: at the very worst we miss an async sample, and we are in interp_only_mode anyway.
1542 
1543     __ verify_oop(reg_cont_obj);
1544 
1545     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1546 
1547     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1548     __ testptr(reg_is_cont, reg_is_cont);
1549     __ jcc(Assembler::notZero, L_thaw);
1550 
1551     // --- Resolve path
1552 
1553     // Make sure the call is patchable
1554     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1555     // Emit stub for static call
1556     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1557     if (stub == nullptr) {
1558       fatal("CodeCache is full at gen_continuation_enter");
1559     }
1560     __ call(resolve);
1561     oop_maps->add_gc_map(__ pc() - start, map);
1562     __ post_call_nop();
1563 
1564     __ jmp(L_exit);
1565   }
1566 
1567   // compiled entry
1568   __ align(CodeEntryAlignment);
1569   compiled_entry_offset = __ pc() - start;
1570   __ enter();
1571 
1572   stack_slots = 2; // will be adjusted in setup
1573   OopMap* map = continuation_enter_setup(masm, stack_slots);
1574 
1575   // Frame is now completed as far as size and linkage.
1576   frame_complete = __ pc() - start;
1577 
1578   __ verify_oop(reg_cont_obj);
1579 
1580   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1581 
1582   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1583   __ testptr(reg_is_cont, reg_is_cont);
1584   __ jccb(Assembler::notZero, L_thaw);
1585 
1586   // --- call Continuation.enter(Continuation c, boolean isContinue)
1587 
1588   // Make sure the call is patchable
1589   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
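       // (Aligning here keeps the call's 4-byte displacement within a single aligned word,
       //  so it can be patched atomically once the call is resolved.)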
1590 
1591   // Emit stub for static call
1592   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1593   if (stub == nullptr) {
1594     fatal("CodeCache is full at gen_continuation_enter");
1595   }
1596 
1597   // The call needs to be resolved. There's a special case for this in
1598   // SharedRuntime::find_callee_info_helper() which calls
1599   // LinkResolver::resolve_continuation_enter() which resolves the call to
1600   // Continuation.enter(Continuation c, boolean isContinue).
1601   __ call(resolve);
1602 
1603   oop_maps->add_gc_map(__ pc() - start, map);
1604   __ post_call_nop();
1605 
1606   __ jmpb(L_exit);
1607 
1608   // --- Thawing path
1609 
1610   __ bind(L_thaw);
1611 
1612   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1613   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1614 
1615   ContinuationEntry::_return_pc_offset = __ pc() - start;
1616   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1617   __ post_call_nop();
1618 
1619   // --- Normal exit (resolve/thawing)
1620 
1621   __ bind(L_exit);
1622   ContinuationEntry::_cleanup_offset = __ pc() - start;
1623   continuation_enter_cleanup(masm);
1624   __ pop(rbp);
1625   __ ret(0);
1626 
1627   // --- Exception handling path
1628 
1629   exception_offset = __ pc() - start;
1630 
1631   continuation_enter_cleanup(masm);
1632   __ pop(rbp);
1633 
1634   __ movptr(c_rarg0, r15_thread);
1635   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1636 
1637   // rax still holds the original exception oop, save it before the call
1638   __ push(rax);
1639 
1640   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1641   __ movptr(rbx, rax);
1642 
1643   // Continue at exception handler:
1644   //   rax: exception oop
1645   //   rbx: exception handler
1646   //   rdx: exception pc
1647   __ pop(rax);
1648   __ verify_oop(rax);
1649   __ pop(rdx);
1650   __ jmp(rbx);
1651 }
1652 
1653 static void gen_continuation_yield(MacroAssembler* masm,
1654                                    const VMRegPair* regs,
1655                                    OopMapSet* oop_maps,
1656                                    int& frame_complete,
1657                                    int& stack_slots,
1658                                    int& compiled_entry_offset) {
1659   enum layout {
1660     rbp_off,
1661     rbpH_off,
1662     return_off,
1663     return_off2,
1664     framesize // inclusive of return address
1665   };
1666   stack_slots = framesize / VMRegImpl::slots_per_word;
1667   assert(stack_slots == 2, "recheck layout");
1668 
1669   address start = __ pc();
1670   compiled_entry_offset = __ pc() - start;
1671   __ enter();
1672   address the_pc = __ pc();
1673 
1674   frame_complete = the_pc - start;
1675 
1676   // This nop must be exactly at the PC we push into the frame info.
1677   // We use this nop for fast CodeBlob lookup, associate the OopMap
1678   // with it right away.
1679   __ post_call_nop();
1680   OopMap* map = new OopMap(framesize, 1);
1681   oop_maps->add_gc_map(frame_complete, map);
1682 
1683   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1684   __ movptr(c_rarg0, r15_thread);
1685   __ movptr(c_rarg1, rsp);
1686   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1687   __ reset_last_Java_frame(true);
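       // rax holds the freeze result: zero means the continuation was frozen successfully;
       // a non-zero value means the freeze failed (e.g. the continuation is pinned, or an
       // exception is pending) and we return to the caller below.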
1688 
1689   Label L_pinned;
1690 
1691   __ testptr(rax, rax);
1692   __ jcc(Assembler::notZero, L_pinned);
1693 
1694   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1695   continuation_enter_cleanup(masm);
1696   __ pop(rbp);
1697   __ ret(0);
1698 
1699   __ bind(L_pinned);
1700 
1701   // Pinned, return to caller
1702 
1703   // handle pending exception thrown by freeze
1704   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1705   Label ok;
1706   __ jcc(Assembler::equal, ok);
1707   __ leave();
1708   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1709   __ bind(ok);
1710 
1711   __ leave();
1712   __ ret(0);
1713 }
1714 
1715 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1716   ::continuation_enter_cleanup(masm);
1717 }
1718 
1719 static void gen_special_dispatch(MacroAssembler* masm,
1720                                  const methodHandle& method,
1721                                  const BasicType* sig_bt,
1722                                  const VMRegPair* regs) {
1723   verify_oop_args(masm, method, sig_bt, regs);
1724   vmIntrinsics::ID iid = method->intrinsic_id();
1725 
1726   // Now write the args into the outgoing interpreter space
1727   bool     has_receiver   = false;
1728   Register receiver_reg   = noreg;
1729   int      member_arg_pos = -1;
1730   Register member_reg     = noreg;
1731   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1732   if (ref_kind != 0) {
1733     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1734     member_reg = rbx;  // known to be free at this point
1735     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1736   } else if (iid == vmIntrinsics::_invokeBasic) {
1737     has_receiver = true;
1738   } else if (iid == vmIntrinsics::_linkToNative) {
1739     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1740     member_reg = rbx;  // known to be free at this point
1741   } else {
1742     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1743   }
1744 
1745   if (member_reg != noreg) {
1746     // Load the member_arg into register, if necessary.
1747     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1748     VMReg r = regs[member_arg_pos].first();
1749     if (r->is_stack()) {
1750       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1751     } else {
1752       // no data motion is needed
1753       member_reg = r->as_Register();
1754     }
1755   }
1756 
1757   if (has_receiver) {
1758     // Make sure the receiver is loaded into a register.
1759     assert(method->size_of_parameters() > 0, "oob");
1760     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1761     VMReg r = regs[0].first();
1762     assert(r->is_valid(), "bad receiver arg");
1763     if (r->is_stack()) {
1764       // Porting note:  This assumes that compiled calling conventions always
1765       // pass the receiver oop in a register.  If this is not true on some
1766       // platform, pick a temp and load the receiver from stack.
1767       fatal("receiver always in a register");
1768       receiver_reg = j_rarg0;  // known to be free at this point
1769       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1770     } else {
1771       // no data motion is needed
1772       receiver_reg = r->as_Register();
1773     }
1774   }
1775 
1776   // Figure out which address we are really jumping to:
1777   MethodHandles::generate_method_handle_dispatch(masm, iid,
1778                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1779 }
1780 
1781 // ---------------------------------------------------------------------------
1782 // Generate a native wrapper for a given method.  The method takes arguments
1783 // in the Java compiled code convention, marshals them to the native
1784 // convention (handlizes oops, etc), transitions to native, makes the call,
1785 // returns to java state (possibly blocking), unhandlizes any result and
1786 // returns.
1787 //
1788 // Critical native functions are a shorthand for the use of
1789 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1790 // functions.  The wrapper is expected to unpack the arguments before
1791 // passing them to the callee. Critical native functions leave the state _in_Java,
1792 // since they cannot stop for GC.
1793 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1794 // block and the check for pending exceptions, since it is impossible for them
1795 // to be thrown.
1796 //
1797 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1798                                                 const methodHandle& method,
1799                                                 int compile_id,
1800                                                 BasicType* in_sig_bt,
1801                                                 VMRegPair* in_regs,
1802                                                 BasicType ret_type) {
1803   if (method->is_continuation_native_intrinsic()) {
1804     int exception_offset = -1;
1805     OopMapSet* oop_maps = new OopMapSet();
1806     int frame_complete = -1;
1807     int stack_slots = -1;
1808     int interpreted_entry_offset = -1;
1809     int vep_offset = -1;
1810     if (method->is_continuation_enter_intrinsic()) {
1811       gen_continuation_enter(masm,
1812                              in_regs,
1813                              exception_offset,
1814                              oop_maps,
1815                              frame_complete,
1816                              stack_slots,
1817                              interpreted_entry_offset,
1818                              vep_offset);
1819     } else if (method->is_continuation_yield_intrinsic()) {
1820       gen_continuation_yield(masm,
1821                              in_regs,
1822                              oop_maps,
1823                              frame_complete,
1824                              stack_slots,
1825                              vep_offset);
1826     } else {
1827       guarantee(false, "Unknown Continuation native intrinsic");
1828     }
1829 
1830 #ifdef ASSERT
1831     if (method->is_continuation_enter_intrinsic()) {
1832       assert(interpreted_entry_offset != -1, "Must be set");
1833       assert(exception_offset != -1,         "Must be set");
1834     } else {
1835       assert(interpreted_entry_offset == -1, "Must be unset");
1836       assert(exception_offset == -1,         "Must be unset");
1837     }
1838     assert(frame_complete != -1,    "Must be set");
1839     assert(stack_slots != -1,       "Must be set");
1840     assert(vep_offset != -1,        "Must be set");
1841 #endif
1842 
1843     __ flush();
1844     nmethod* nm = nmethod::new_native_nmethod(method,
1845                                               compile_id,
1846                                               masm->code(),
1847                                               vep_offset,
1848                                               frame_complete,
1849                                               stack_slots,
1850                                               in_ByteSize(-1),
1851                                               in_ByteSize(-1),
1852                                               oop_maps,
1853                                               exception_offset);
1854     if (nm == nullptr) return nm;
1855     if (method->is_continuation_enter_intrinsic()) {
1856       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1857     } else if (method->is_continuation_yield_intrinsic()) {
1858       _cont_doYield_stub = nm;
1859     }
1860     return nm;
1861   }
1862 
1863   if (method->is_method_handle_intrinsic()) {
1864     vmIntrinsics::ID iid = method->intrinsic_id();
1865     intptr_t start = (intptr_t)__ pc();
1866     int vep_offset = ((intptr_t)__ pc()) - start;
1867     gen_special_dispatch(masm,
1868                          method,
1869                          in_sig_bt,
1870                          in_regs);
1871     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1872     __ flush();
1873     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1874     return nmethod::new_native_nmethod(method,
1875                                        compile_id,
1876                                        masm->code(),
1877                                        vep_offset,
1878                                        frame_complete,
1879                                        stack_slots / VMRegImpl::slots_per_word,
1880                                        in_ByteSize(-1),
1881                                        in_ByteSize(-1),
1882                                        nullptr);
1883   }
1884   address native_func = method->native_function();
1885   assert(native_func != nullptr, "must have function");
1886 
1887   // An OopMap for lock (and class if static)
1888   OopMapSet *oop_maps = new OopMapSet();
1889   intptr_t start = (intptr_t)__ pc();
1890 
1891   // We have received a description of where all the java args are located
1892   // on entry to the wrapper. We need to convert these args to where
1893   // the jni function will expect them. To figure out where they go
1894   // we convert the java signature to a C signature by inserting
1895   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1896 
1897   const int total_in_args = method->size_of_parameters();
1898   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1899 
1900   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1901   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1902 
1903   int argc = 0;
1904   out_sig_bt[argc++] = T_ADDRESS;
1905   if (method->is_static()) {
1906     out_sig_bt[argc++] = T_OBJECT;
1907   }
1908 
1909   for (int i = 0; i < total_in_args ; i++ ) {
1910     out_sig_bt[argc++] = in_sig_bt[i];
1911   }
1912 
1913   // Now figure out where the args must be stored and how much stack space
1914   // they require.
1915   int out_arg_slots;
1916   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1917 
1918   // Compute framesize for the wrapper.  We need to handlize all oops in
1919   // incoming registers
1920 
1921   // Calculate the total number of stack slots we will need.
1922 
1923   // First count the abi requirement plus all of the outgoing args
1924   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1925 
1926   // Now the space for the inbound oop handle area
1927   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1928 
1929   int oop_handle_offset = stack_slots;
1930   stack_slots += total_save_slots;
1931 
1932   // Now any space we need for handlizing a klass if static method
1933 
1934   int klass_slot_offset = 0;
1935   int klass_offset = -1;
1936   int lock_slot_offset = 0;
1937   bool is_static = false;
1938 
1939   if (method->is_static()) {
1940     klass_slot_offset = stack_slots;
1941     stack_slots += VMRegImpl::slots_per_word;
1942     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1943     is_static = true;
1944   }
1945 
1946   // Plus a lock if needed
1947 
1948   if (method->is_synchronized()) {
1949     lock_slot_offset = stack_slots;
1950     stack_slots += VMRegImpl::slots_per_word;
1951   }
1952 
1953   // Now a place (+2) to save return values or temp during shuffling
1954   // + 4 for return address (which we own) and saved rbp
1955   stack_slots += 6;
1956 
1957   // OK, the space we have allocated will look like:
1958   //
1959   //
1960   // FP-> |                     |
1961   //      |---------------------|
1962   //      | 2 slots for moves   |
1963   //      |---------------------|
1964   //      | lock box (if sync)  |
1965   //      |---------------------| <- lock_slot_offset
1966   //      | klass (if static)   |
1967   //      |---------------------| <- klass_slot_offset
1968   //      | oopHandle area      |
1969   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1970   //      | outbound memory     |
1971   //      | based arguments     |
1972   //      |                     |
1973   //      |---------------------|
1974   //      |                     |
1975   // SP-> | out_preserved_slots |
1976   //
1977   //
1978 
1979 
1980   // Now compute actual number of stack words we need rounding to make
1981   // stack properly aligned.
1982   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1983 
1984   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1985 
1986   // First thing make an ic check to see if we should even be here
1987 
1988   // We are free to use all registers as temps without saving them and
1989   // restoring them except rbp. rbp is the only callee save register
1990   // as far as the interpreter and the compiler(s) are concerned.
1991 
1992   const Register receiver = j_rarg0;
1993 
1994   Label exception_pending;
1995 
1996   assert_different_registers(receiver, rscratch1, rscratch2);
1997   __ verify_oop(receiver);
1998   __ ic_check(8 /* end_alignment */);
1999 
2000   int vep_offset = ((intptr_t)__ pc()) - start;
2001 
2002   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2003     Label L_skip_barrier;
2004     Register klass = r10;
2005     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2006     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2007 
2008     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2009 
2010     __ bind(L_skip_barrier);
2011   }
2012 
2013 #ifdef COMPILER1
2014   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2015   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2016     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2017   }
2018 #endif // COMPILER1
2019 
2020   // The instruction at the verified entry point must be 5 bytes or longer
2021   // because it can be patched on the fly by make_non_entrant. The stack bang
2022   // instruction fits that requirement.
2023 
2024   // Generate stack overflow check
2025   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2026 
2027   // Generate a new frame for the wrapper.
2028   __ enter();
2029   // -2 because return address is already present and so is saved rbp
2030   __ subptr(rsp, stack_size - 2*wordSize);
2031 
2032   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2033   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2034   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2035 
2036   // Frame is now completed as far as size and linkage.
2037   int frame_complete = ((intptr_t)__ pc()) - start;
2038 
2039 #ifdef ASSERT
2040   __ check_stack_alignment(rsp, "improperly aligned stack");
2041 #endif /* ASSERT */
2042 
2043 
2044   // We use r14 as the oop handle for the receiver/klass
2045   // It is callee save so it survives the call to native
2046 
2047   const Register oop_handle_reg = r14;
2048 
2049   //
2050   // We immediately shuffle the arguments so that for any VM call we have to
2051   // make from here on out (sync slow path, jvmti, etc.) we will have
2052   // captured the oops from our caller and have a valid oopMap for
2053   // them.
2054 
2055   // -----------------
2056   // The Grand Shuffle
2057 
2058   // The Java calling convention is either equal to (Linux) or denser than (Win64) the
2059   // C calling convention. However, because of the jni_env argument the C calling
2060   // convention always has at least one more argument (and two more for static methods) than Java.
2061   // Therefore, if we move the args from Java -> C backwards, then we will never have
2062   // a register->register conflict and we don't have to build a dependency graph
2063   // and figure out how to break any cycles.
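       // Concretely: Java arg i ends up as C arg i+1 (i+2 for static methods), so every
       // move targets an equal-or-higher argument position; processing from the last
       // argument backwards therefore consumes each source before it can be overwritten.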
2064   //
2065 
2066   // Record esp-based slot for receiver on stack for non-static methods
2067   int receiver_offset = -1;
2068 
2069   // This is a trick. We double the stack slots so we can claim
2070   // the oops in the caller's frame. Since we are sure to have
2071   // more args than the caller, doubling is enough to make
2072   // sure we can capture all the incoming oop args from the
2073   // caller.
2074   //
2075   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2076 
2077   // Mark location of rbp (someday)
2078   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2079 
2080   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2081   // All inbound args are referenced based on rbp and all outbound args via rsp.
2082 
2083 
2084 #ifdef ASSERT
2085   bool reg_destroyed[Register::number_of_registers];
2086   bool freg_destroyed[XMMRegister::number_of_registers];
2087   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2088     reg_destroyed[r] = false;
2089   }
2090   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2091     freg_destroyed[f] = false;
2092   }
2093 
2094 #endif /* ASSERT */
2095 
2096   // For JNI natives the incoming and outgoing registers are offset upwards.
2097   GrowableArray<int> arg_order(2 * total_in_args);
2098 
2099   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2100     arg_order.push(i);
2101     arg_order.push(c_arg);
2102   }
2103 
2104   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2105     int i = arg_order.at(ai);
2106     int c_arg = arg_order.at(ai + 1);
2107     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2108 #ifdef ASSERT
2109     if (in_regs[i].first()->is_Register()) {
2110       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2111     } else if (in_regs[i].first()->is_XMMRegister()) {
2112       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2113     }
2114     if (out_regs[c_arg].first()->is_Register()) {
2115       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2116     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2117       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2118     }
2119 #endif /* ASSERT */
2120     switch (in_sig_bt[i]) {
2121       case T_ARRAY:
2122       case T_OBJECT:
2123         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2124                     ((i == 0) && (!is_static)),
2125                     &receiver_offset);
2126         break;
2127       case T_VOID:
2128         break;
2129 
2130       case T_FLOAT:
2131         __ float_move(in_regs[i], out_regs[c_arg]);
2132         break;
2133 
2134       case T_DOUBLE:
2135         assert( i + 1 < total_in_args &&
2136                 in_sig_bt[i + 1] == T_VOID &&
2137                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2138         __ double_move(in_regs[i], out_regs[c_arg]);
2139         break;
2140 
2141       case T_LONG :
2142         __ long_move(in_regs[i], out_regs[c_arg]);
2143         break;
2144 
2145       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2146 
2147       default:
2148         __ move32_64(in_regs[i], out_regs[c_arg]);
2149     }
2150   }
2151 
2152   int c_arg;
2153 
2154   // Pre-load a static method's oop into r14.  Used both by locking code and
2155   // the normal JNI call code.
2156   // Point c_arg at the first arg that is already loaded in case we
2157   // need to spill before we call out.
2158   c_arg = total_c_args - total_in_args;
2159 
2160   if (method->is_static()) {
2161 
2162     //  load oop into a register
2163     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2164 
2165     // Now handlize the static class mirror; it's known to be non-null.
2166     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2167     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2168 
2169     // Now get the handle
2170     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2171     // store the klass handle as second argument
2172     __ movptr(c_rarg1, oop_handle_reg);
2173     // and protect the arg if we must spill
2174     c_arg--;
2175   }
2176 
2177   // Change state to native (we save the return address in the thread, since it might not
2178   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2179   // points into the right code segment. It does not have to be the correct return pc.
2180   // We use the same pc/oopMap repeatedly when we call out
2181 
2182   Label native_return;
2183   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2184     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2185     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2186   } else {
2187     intptr_t the_pc = (intptr_t) __ pc();
2188     oop_maps->add_gc_map(the_pc - start, map);
2189 
2190     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2191   }
2192 
2193   // We have all of the arguments set up at this point. We must not touch any register
2194   // argument registers from here on (if we saved/restored them, there would be no oop map for them).
2195 
2196   if (DTraceMethodProbes) {
2197     // protect the args we've loaded
2198     save_args(masm, total_c_args, c_arg, out_regs);
2199     __ mov_metadata(c_rarg1, method());
2200     __ call_VM_leaf(
2201       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2202       r15_thread, c_rarg1);
2203     restore_args(masm, total_c_args, c_arg, out_regs);
2204   }
2205 
2206   // RedefineClasses() tracing support for obsolete method entry
2207   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2208     // protect the args we've loaded
2209     save_args(masm, total_c_args, c_arg, out_regs);
2210     __ mov_metadata(c_rarg1, method());
2211     __ call_VM_leaf(
2212       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2213       r15_thread, c_rarg1);
2214     restore_args(masm, total_c_args, c_arg, out_regs);
2215   }
2216 
2217   // Lock a synchronized method
2218 
2219   // Register definitions used by locking and unlocking
2220 
2221   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2222   const Register obj_reg  = rbx;  // Will contain the oop
2223   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2224   const Register old_hdr  = r13;  // value of old header at unlock time
2225 
2226   Label slow_path_lock;
2227   Label lock_done;
2228 
2229   if (method->is_synchronized()) {
2230     Label count_mon;
2231 
2232     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2233 
2234     // Get the handle (the 2nd argument)
2235     __ mov(oop_handle_reg, c_rarg1);
2236 
2237     // Get address of the box
2238 
2239     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2240 
2241     // Load the oop from the handle
2242     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2243 
2244     if (LockingMode == LM_MONITOR) {
2245       __ jmp(slow_path_lock);
2246     } else if (LockingMode == LM_LEGACY) {
2247       // Load immediate 1 into swap_reg %rax
2248       __ movl(swap_reg, 1);
2249 
2250       // Load (object->mark() | 1) into swap_reg %rax
2251       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2252 
2253       // Save (object->mark() | 1) into BasicLock's displaced header
2254       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2255 
2256       // src -> dest iff dest == rax else rax <- dest
2257       __ lock();
2258       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2259       __ jcc(Assembler::equal, count_mon);
2260 
2261       // Hmm should this move to the slow path code area???
2262 
2263       // Test if the oopMark is an obvious stack pointer, i.e.,
2264       //  1) (mark & 3) == 0, and
2265       //  2) rsp <= mark < mark + os::pagesize()
2266       // These 3 tests can be done by evaluating the following
2267       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2268       // assuming both stack pointer and pagesize have their
2269       // least significant 2 bits clear.
2270       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2271 
2272       __ subptr(swap_reg, rsp);
2273       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2274 
2275       // Save the test result, for recursive case, the result is zero
2276       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2277       __ jcc(Assembler::notEqual, slow_path_lock);
2278 
2279       __ bind(count_mon);
2280       __ inc_held_monitor_count();
2281     } else {
2282       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2283       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2284     }
2285 
2286     // Slow path will re-enter here
2287     __ bind(lock_done);
2288   }
2289 
2290   // Finally just about ready to make the JNI call
2291 
2292   // get JNIEnv* which is first argument to native
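       // (JNIEnv* is simply the address of the JavaThread's embedded jni_environment field,
       //  so a lea is all that is needed.)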
2293   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2294 
2295   // Now set thread in native
2296   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2297 
2298   __ call(RuntimeAddress(native_func));
2299 
2300   // Verify or restore cpu control state after JNI call
2301   __ restore_cpu_control_state_after_jni(rscratch1);
2302 
2303   // Unpack native results.
2304   switch (ret_type) {
2305   case T_BOOLEAN: __ c2bool(rax);            break;
2306   case T_CHAR   : __ movzwl(rax, rax);      break;
2307   case T_BYTE   : __ sign_extend_byte (rax); break;
2308   case T_SHORT  : __ sign_extend_short(rax); break;
2309   case T_INT    : /* nothing to do */        break;
2310   case T_DOUBLE :
2311   case T_FLOAT  :
2312     // Result is in xmm0; we'll save it as needed
2313     break;
2314   case T_ARRAY:                 // Really a handle
2315   case T_OBJECT:                // Really a handle
2316       break; // can't de-handlize until after safepoint check
2317   case T_VOID: break;
2318   case T_LONG: break;
2319   default       : ShouldNotReachHere();
2320   }
2321 
2322   // Switch thread to "native transition" state before reading the synchronization state.
2323   // This additional state is necessary because reading and testing the synchronization
2324   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2325   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2326   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2327   //     Thread A is resumed to finish this native method, but doesn't block here since it
2328   //     didn't see any synchronization in progress, and escapes.
2329   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2330 
2331   // Force this write out before the read below
2332   if (!UseSystemMemoryBarrier) {
2333     __ membar(Assembler::Membar_mask_bits(
2334               Assembler::LoadLoad | Assembler::LoadStore |
2335               Assembler::StoreLoad | Assembler::StoreStore));
2336   }
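       // (With UseSystemMemoryBarrier the serializing effect is achieved by a process-wide
       //  barrier issued from the VM side instead, so the per-thread fence can be skipped.)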
2337 
2338   // check for safepoint operation in progress and/or pending suspend requests
2339   {
2340     Label Continue;
2341     Label slow_path;
2342 
2343     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2344 
2345     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2346     __ jcc(Assembler::equal, Continue);
2347     __ bind(slow_path);
2348 
2349     // Don't use call_VM as it will see a possible pending exception and forward it
2350     // and never return here, preventing us from clearing _last_native_pc down below.
2351     // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2352     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2353     // by hand.
2354     //
2355     __ vzeroupper();
2356     save_native_result(masm, ret_type, stack_slots);
2357     __ mov(c_rarg0, r15_thread);
2358     __ mov(r12, rsp); // remember sp
2359     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2360     __ andptr(rsp, -16); // align stack as required by ABI
2361     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2362     __ mov(rsp, r12); // restore sp
2363     __ reinit_heapbase();
2364     // Restore any method result value
2365     restore_native_result(masm, ret_type, stack_slots);
2366     __ bind(Continue);
2367   }
2368 
2369   // change thread state
2370   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2371 
2372   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2373     // Check preemption for Object.wait()
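         // A non-null alternate return address means the thread was preempted during the
         // wait; in that case we clear the field and resume at the stored pc instead of
         // falling through to the normal return path.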
2374     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2375     __ cmpptr(rscratch1, NULL_WORD);
2376     __ jccb(Assembler::equal, native_return);
2377     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2378     __ jmp(rscratch1);
2379     __ bind(native_return);
2380 
2381     intptr_t the_pc = (intptr_t) __ pc();
2382     oop_maps->add_gc_map(the_pc - start, map);
2383   }
2384 
2385 
2386   Label reguard;
2387   Label reguard_done;
2388   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2389   __ jcc(Assembler::equal, reguard);
2390   __ bind(reguard_done);
2391 
2392   // native result if any is live
2393 
2394   // Unlock
2395   Label slow_path_unlock;
2396   Label unlock_done;
2397   if (method->is_synchronized()) {
2398 
2399     Label fast_done;
2400 
2401     // Get locked oop from the handle we passed to jni
2402     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2403 
2404     if (LockingMode == LM_LEGACY) {
2405       Label not_recur;
2406       // Simple recursive lock?
2407       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2408       __ jcc(Assembler::notEqual, not_recur);
2409       __ dec_held_monitor_count();
2410       __ jmpb(fast_done);
2411       __ bind(not_recur);
2412     }
2413 
2414     // Must save rax if it is live now because cmpxchg must use it
2415     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2416       save_native_result(masm, ret_type, stack_slots);
2417     }
2418 
2419     if (LockingMode == LM_MONITOR) {
2420       __ jmp(slow_path_unlock);
2421     } else if (LockingMode == LM_LEGACY) {
2422       // get address of the stack lock
2423       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2424       //  get old displaced header
2425       __ movptr(old_hdr, Address(rax, 0));
2426 
2427       // Atomic swap old header if oop still contains the stack lock
2428       __ lock();
2429       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2430       __ jcc(Assembler::notEqual, slow_path_unlock);
2431       __ dec_held_monitor_count();
2432     } else {
2433       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2434       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2435     }
2436 
2437     // slow path re-enters here
2438     __ bind(unlock_done);
2439     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2440       restore_native_result(masm, ret_type, stack_slots);
2441     }
2442 
2443     __ bind(fast_done);
2444   }
2445   if (DTraceMethodProbes) {
2446     save_native_result(masm, ret_type, stack_slots);
2447     __ mov_metadata(c_rarg1, method());
2448     __ call_VM_leaf(
2449          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2450          r15_thread, c_rarg1);
2451     restore_native_result(masm, ret_type, stack_slots);
2452   }
2453 
2454   __ reset_last_Java_frame(false);
2455 
2456   // Unbox oop result, e.g. JNIHandles::resolve value.
2457   if (is_reference_type(ret_type)) {
2458     __ resolve_jobject(rax /* value */,
2459                        r15_thread /* thread */,
2460                        rcx /* tmp */);
2461   }
2462 
2463   if (CheckJNICalls) {
2464     // clear_pending_jni_exception_check
2465     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2466   }
2467 
2468   // reset handle block
2469   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2470   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
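       // Zeroing the top pointer logically frees every JNI local handle created during the
       // native call; the handle block itself is kept for reuse.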
2471 
2472   // pop our frame
2473 
2474   __ leave();
2475 
2476   // Any exception pending?
2477   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2478   __ jcc(Assembler::notEqual, exception_pending);
2479 
2480   // Return
2481 
2482   __ ret(0);
2483 
2484   // Unexpected paths are out of line and go here
2485 
2486   // forward the exception
2487   __ bind(exception_pending);
2488 
2489   // and forward the exception
2490   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2491 
2492   // Slow path locking & unlocking
2493   if (method->is_synchronized()) {
2494 
2495     // BEGIN Slow path lock
2496     __ bind(slow_path_lock);
2497 
2498     // last_Java_frame is already set up. No exceptions, so do a vanilla call, not call_VM.
2499     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2500 
2501     // protect the args we've loaded
2502     save_args(masm, total_c_args, c_arg, out_regs);
2503 
2504     __ mov(c_rarg0, obj_reg);
2505     __ mov(c_rarg1, lock_reg);
2506     __ mov(c_rarg2, r15_thread);
2507 
2508     // Not a leaf but we have last_Java_frame setup as we want.
2509     // We don't want to unmount in case of contention since that would complicate preserving
2510     // the arguments that had already been marshalled into the native convention. So we force
2511     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2512     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2513     __ push_cont_fastpath();
2514     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2515     __ pop_cont_fastpath();
2516     restore_args(masm, total_c_args, c_arg, out_regs);
2517 
2518 #ifdef ASSERT
2519     { Label L;
2520     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2521     __ jcc(Assembler::equal, L);
2522     __ stop("no pending exception allowed on exit from monitorenter");
2523     __ bind(L);
2524     }
2525 #endif
2526     __ jmp(lock_done);
2527 
2528     // END Slow path lock
2529 
2530     // BEGIN Slow path unlock
2531     __ bind(slow_path_unlock);
2532 
2533     // If we haven't already saved the native result we must save it now as xmm registers
2534     // are still exposed.
2535     __ vzeroupper();
2536     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2537       save_native_result(masm, ret_type, stack_slots);
2538     }
2539 
2540     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2541 
2542     __ mov(c_rarg0, obj_reg);
2543     __ mov(c_rarg2, r15_thread);
2544     __ mov(r12, rsp); // remember sp
2545     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2546     __ andptr(rsp, -16); // align stack as required by ABI
2547 
2548     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2549     // NOTE that obj_reg == rbx currently
2550     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2551     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2552 
2553     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2554     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2555     __ mov(rsp, r12); // restore sp
2556     __ reinit_heapbase();
2557 #ifdef ASSERT
2558     {
2559       Label L;
2560       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2561       __ jcc(Assembler::equal, L);
2562       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2563       __ bind(L);
2564     }
2565 #endif /* ASSERT */
2566 
2567     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2568 
2569     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2570       restore_native_result(masm, ret_type, stack_slots);
2571     }
2572     __ jmp(unlock_done);
2573 
2574     // END Slow path unlock
2575 
2576   } // synchronized
2577 
2578   // SLOW PATH Reguard the stack if needed
2579 
2580   __ bind(reguard);
2581   __ vzeroupper();
2582   save_native_result(masm, ret_type, stack_slots);
2583   __ mov(r12, rsp); // remember sp
2584   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2585   __ andptr(rsp, -16); // align stack as required by ABI
2586   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2587   __ mov(rsp, r12); // restore sp
2588   __ reinit_heapbase();
2589   restore_native_result(masm, ret_type, stack_slots);
2590   // and continue
2591   __ jmp(reguard_done);
2592 
2593 
2594 
2595   __ flush();
2596 
2597   nmethod *nm = nmethod::new_native_nmethod(method,
2598                                             compile_id,
2599                                             masm->code(),
2600                                             vep_offset,
2601                                             frame_complete,
2602                                             stack_slots / VMRegImpl::slots_per_word,
2603                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2604                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2605                                             oop_maps);
2606 
2607   return nm;
2608 }
2609 
2610 // This function returns the adjustment size (in number of words) to a c2i adapter
2611 // activation for use during deoptimization.
2612 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2613   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2614 }
2615 
2616 
2617 uint SharedRuntime::out_preserve_stack_slots() {
2618   return 0;
2619 }
2620 
2621 
2622 // Number of stack slots between incoming argument block and the start of
2623 // a new frame.  The PROLOG must add this many slots to the stack.  The
2624 // EPILOG must remove this many slots.  amd64 needs two slots for
2625 // return address.
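     // (The four slots presumably cover the return address and the saved rbp, two 32-bit
     //  slots each; VerifyStackAtCalls reserves room for an extra verification word.)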
2626 uint SharedRuntime::in_preserve_stack_slots() {
2627   return 4 + 2 * VerifyStackAtCalls;
2628 }
2629 
2630 VMReg SharedRuntime::thread_register() {
2631   return r15_thread->as_VMReg();
2632 }
2633 
2634 //------------------------------generate_deopt_blob----------------------------
2635 void SharedRuntime::generate_deopt_blob() {
2636   // Allocate space for the code
2637   ResourceMark rm;
2638   // Setup code generation tools
2639   int pad = 0;
2640   if (UseAVX > 2) {
2641     pad += 1024;
2642   }
2643   if (UseAPX) {
2644     pad += 1024;
2645   }
2646 #if INCLUDE_JVMCI
2647   if (EnableJVMCI) {
2648     pad += 512; // Increase the buffer size when compiling for JVMCI
2649   }
2650 #endif
2651   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2652   CodeBuffer buffer(name, 2560+pad, 1024);
2653   MacroAssembler* masm = new MacroAssembler(&buffer);
2654   int frame_size_in_words;
2655   OopMap* map = nullptr;
2656   OopMapSet *oop_maps = new OopMapSet();
2657 
2658   // -------------
2659   // This code enters when returning to a de-optimized nmethod.  A return
2660   // address has been pushed on the stack, and return values are in
2661   // registers.
2662   // If we are doing a normal deopt then we were called from the patched
2663   // nmethod at the point where we returned into the nmethod, so the return
2664   // address on the stack is off by NativeCall::instruction_size.
2665   // We will adjust the value so it looks like we have the original return
2666   // address on the stack (like when we eagerly deoptimized).
2667   // In the case of an exception pending when deoptimizing, we enter
2668   // with a return address on the stack that points after the call we patched
2669   // into the exception handler. We have the following register state from,
2670   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2671   //    rax: exception oop
2672   //    rbx: exception handler
2673   //    rdx: throwing pc
2674   // So in this case we simply jam rdx into the useless return address and
2675   // the stack looks just like we want.
2676   //
2677   // At this point we need to de-opt.  We save the argument return
2678   // registers.  We call the first C routine, fetch_unroll_info().  This
2679   // routine captures the return values and returns a structure which
2680   // describes the current frame size and the sizes of all replacement frames.
2681   // The current frame is compiled code and may contain many inlined
2682   // functions, each with their own JVM state.  We pop the current frame, then
2683   // push all the new frames.  Then we call the C routine unpack_frames() to
2684   // populate these frames.  Finally unpack_frames() returns us the new target
2685   // address.  Notice that callee-save registers are BLOWN here; they have
2686   // already been captured in the vframeArray at the time the return PC was
2687   // patched.
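       // Throughout this blob the exec mode is kept in the callee-saved r14 and
       // is one of Deoptimization::Unpack_deopt, Unpack_reexecute or
       // Unpack_exception, depending on the entry point taken; it is later
       // passed to fetch_unroll_info() and unpack_frames().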
2688   address start = __ pc();
2689   Label cont;
2690 
2691   // Prolog for non exception case!
2692 
2693   // Save everything in sight.
2694   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2695 
2696   // Normal deoptimization.  Save exec mode for unpack_frames.
2697   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2698   __ jmp(cont);
2699 
2700   int reexecute_offset = __ pc() - start;
2701 #if INCLUDE_JVMCI && !defined(COMPILER1)
2702   if (UseJVMCICompiler) {
2703     // JVMCI does not use this kind of deoptimization
2704     __ should_not_reach_here();
2705   }
2706 #endif
2707 
2708   // Reexecute case
2709   // The return address is the pc that describes which bci to re-execute at.
2710 
2711   // No need to update map as each call to save_live_registers will produce identical oopmap
2712   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2713 
2714   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2715   __ jmp(cont);
2716 
2717 #if INCLUDE_JVMCI
2718   Label after_fetch_unroll_info_call;
2719   int implicit_exception_uncommon_trap_offset = 0;
2720   int uncommon_trap_offset = 0;
2721 
2722   if (EnableJVMCI) {
2723     implicit_exception_uncommon_trap_offset = __ pc() - start;
2724 
2725     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2726     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2727 
2728     uncommon_trap_offset = __ pc() - start;
2729 
2730     // Save everything in sight.
2731     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2732     // fetch_unroll_info needs to call last_java_frame()
2733     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2734 
2735     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2736     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2737 
2738     __ movl(r14, Deoptimization::Unpack_reexecute);
2739     __ mov(c_rarg0, r15_thread);
2740     __ movl(c_rarg2, r14); // exec mode
2741     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2742     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2743 
2744     __ reset_last_Java_frame(false);
2745 
2746     __ jmp(after_fetch_unroll_info_call);
2747   } // EnableJVMCI
2748 #endif // INCLUDE_JVMCI
2749 
2750   int exception_offset = __ pc() - start;
2751 
2752   // Prolog for exception case
2753 
2754   // All registers are dead at this entry point, except for rax and rdx,
2755   // which contain the exception oop and exception pc respectively.
2756   // Set them in TLS and fall thru to the
2757   // unpack_with_exception_in_tls entry point.
2758 
2759   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2760   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2761 
2762   int exception_in_tls_offset = __ pc() - start;
2763 
2764   // new implementation because exception oop is now passed in JavaThread
2765 
2766   // Prolog for exception case
2767   // All registers must be preserved because they might be used by LinearScan
2768   // Exception oop and throwing PC are passed in JavaThread
2769   // tos: stack at point of call to method that threw the exception (i.e. only
2770   // args are on the stack, no return address)
2771 
2772   // make room on stack for the return address
2773   // It will be patched later with the throwing pc. The correct value is not
2774   // available now because loading it from memory would destroy registers.
2775   __ push(0);
2776 
2777   // Save everything in sight.
2778   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2779 
2780   // Now it is safe to overwrite any register
2781 
2782   // Deopt during an exception.  Save exec mode for unpack_frames.
2783   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2784 
2785   // load throwing pc from JavaThread and patch it as the return address
2786   // of the current frame. Then clear the field in JavaThread
2787 
2788   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2789   __ movptr(Address(rbp, wordSize), rdx);
2790   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2791 
2792 #ifdef ASSERT
2793   // verify that there is really an exception oop in JavaThread
2794   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2795   __ verify_oop(rax);
2796 
2797   // verify that there is no pending exception
2798   Label no_pending_exception;
2799   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2800   __ testptr(rax, rax);
2801   __ jcc(Assembler::zero, no_pending_exception);
2802   __ stop("must not have pending exception here");
2803   __ bind(no_pending_exception);
2804 #endif
2805 
2806   __ bind(cont);
2807 
2808   // Call C code.  Need thread and this frame, but NOT official VM entry
2809   // crud.  We cannot block on this call, no GC can happen.
2810   //
2811   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2812 
2813   // fetch_unroll_info needs to call last_java_frame().
2814 
2815   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2816 #ifdef ASSERT
2817   { Label L;
2818     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2819     __ jcc(Assembler::equal, L);
2820     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2821     __ bind(L);
2822   }
2823 #endif // ASSERT
2824   __ mov(c_rarg0, r15_thread);
2825   __ movl(c_rarg1, r14); // exec_mode
2826   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2827 
2828   // Need to have an oopmap that tells fetch_unroll_info where to
2829   // find any register it might need.
2830   oop_maps->add_gc_map(__ pc() - start, map);
2831 
2832   __ reset_last_Java_frame(false);
2833 
2834 #if INCLUDE_JVMCI
2835   if (EnableJVMCI) {
2836     __ bind(after_fetch_unroll_info_call);
2837   }
2838 #endif
2839 
2840   // Load UnrollBlock* into rdi
2841   __ mov(rdi, rax);
2842 
2843   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2844   Label noException;
2845   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2846   __ jcc(Assembler::notEqual, noException);
2847   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2848   // QQQ this is useless, it was null above
2849   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2850   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2851   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2852 
2853   __ verify_oop(rax);
2854 
2855   // Overwrite the result registers with the exception results.
2856   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2857   // I think this is useless
2858   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2859 
2860   __ bind(noException);
2861 
2862   // Only register save data is on the stack.
2863   // Now restore the result registers.  Everything else is either dead
2864   // or captured in the vframeArray.
2865   RegisterSaver::restore_result_registers(masm);
2866 
2867   // All of the register save area has been popped off the stack. Only the
2868   // return address remains.
2869 
2870   // Pop all the frames we must move/replace.
2871   //
2872   // Frame picture (youngest to oldest)
2873   // 1: self-frame (no frame link)
2874   // 2: deopting frame  (no frame link)
2875   // 3: caller of deopting frame (could be compiled/interpreted).
2876   //
2877   // Note: by leaving the return address of self-frame on the stack
2878   // and using the size of frame 2 to adjust the stack
2879   // when we are done the return to frame 3 will still be on the stack.
2880 
2881   // Pop deoptimized frame
2882   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2883   __ addptr(rsp, rcx);
2884 
2885   // rsp should be pointing at the return address to the caller (3)
2886 
2887   // Pick up the initial fp we should save
2888   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2889   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2890 
2891 #ifdef ASSERT
2892   // Compilers generate code that bangs the stack by as much as the
2893   // interpreter would need. So this stack banging should never
2894   // trigger a fault. Verify that it does not on non-product builds.
2895   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2896   __ bang_stack_size(rbx, rcx);
2897 #endif
2898 
2899   // Load address of array of frame pcs into rcx
2900   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2901 
2902   // Trash the old pc
2903   __ addptr(rsp, wordSize);
2904 
2905   // Load address of array of frame sizes into rsi
2906   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2907 
2908   // Load counter into rdx
2909   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2910 
2911   // Now adjust the caller's stack to make up for the extra locals, but
2912   // record the original sp first so that we can save it in the skeletal
2913   // interpreter frame; the stack walking of interpreter_sender will then get
2914   // the unextended sp value and not the "real" sp value.
2915 
2916   const Register sender_sp = r8;
2917 
2918   __ mov(sender_sp, rsp);
2919   __ movl(rbx, Address(rdi,
2920                        Deoptimization::UnrollBlock::
2921                        caller_adjustment_offset()));
2922   __ subptr(rsp, rbx);
2923 
2924   // Push interpreter frames in a loop
2925   Label loop;
2926   __ bind(loop);
2927   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2928   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2929   __ pushptr(Address(rcx, 0));          // Save return address
2930   __ enter();                           // Save old & set new ebp
2931   __ subptr(rsp, rbx);                  // Prolog
2932   // This value is corrected by layout_activation_impl
2933   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2934   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2935   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2936   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2937   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2938   __ decrementl(rdx);                   // Decrement counter
2939   __ jcc(Assembler::notZero, loop);
2940   __ pushptr(Address(rcx, 0));          // Save final return address
2941 
2942   // Re-push self-frame
2943   __ enter();                           // Save old & set new ebp
2944 
2945   // Allocate a full sized register save area.
2946   // Return address and rbp are in place, so we allocate two less words.
2947   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2948 
2949   // Restore frame locals after moving the frame
2950   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2951   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2952 
2953   // Call C code.  Need thread but NOT official VM entry
2954   // crud.  We cannot block on this call, no GC can happen.  Call should
2955   // restore return values to their stack-slots with the new SP.
2956   //
2957   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2958 
2959   // Use rbp because the frames look interpreted now
2960   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2961   // Don't need the precise return PC here, just precise enough to point into this code blob.
2962   address the_pc = __ pc();
2963   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2964 
2965   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2966   __ mov(c_rarg0, r15_thread);
2967   __ movl(c_rarg1, r14); // second arg: exec_mode
2968   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2969   // Revert SP alignment after call since we're going to do some SP relative addressing below
2970   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2971 
2972   // Set an oopmap for the call site
2973   // Use the same PC we used for the last java frame
2974   oop_maps->add_gc_map(the_pc - start,
2975                        new OopMap( frame_size_in_words, 0 ));
2976 
2977   // Clear fp AND pc
2978   __ reset_last_Java_frame(true);
2979 
2980   // Collect return values
2981   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2982   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2983   // I think this is useless (throwing pc?)
2984   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2985 
2986   // Pop self-frame.
2987   __ leave();                           // Epilog
2988 
2989   // Jump to interpreter
2990   __ ret(0);
2991 
2992   // Make sure all code is generated
2993   masm->flush();
2994 
2995   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2996   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2997 #if INCLUDE_JVMCI
2998   if (EnableJVMCI) {
2999     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3000     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3001   }
3002 #endif
3003 }
3004 
3005 //------------------------------generate_handler_blob------
3006 //
3007 // Generate a special Compile2Runtime blob that saves all registers,
3008 // and sets up an oopmap.
3009 //
3010 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3011   assert(StubRoutines::forward_exception_entry() != nullptr,
3012          "must be generated before");
3013   assert(is_polling_page_id(id), "expected a polling page stub id");
3014 
3015   ResourceMark rm;
3016   OopMapSet *oop_maps = new OopMapSet();
3017   OopMap* map;
3018 
3019   // Allocate space for the code.  Setup code generation tools.
3020   const char* name = SharedRuntime::stub_name(id);
3021   CodeBuffer buffer(name, 2548, 1024);
3022   MacroAssembler* masm = new MacroAssembler(&buffer);
3023 
3024   address start   = __ pc();
3025   address call_pc = nullptr;
3026   int frame_size_in_words;
3027   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3028   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3029 
3030   // Make room for return address (or push it again)
3031   if (!cause_return) {
3032     __ push(rbx);
3033   }
3034 
3035   // Save registers, fpu state, and flags
3036   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3037 
3038   // The following is basically a call_VM.  However, we need the precise
3039   // address of the call in order to generate an oopmap. Hence, we do all the
3040   // work ourselves.
3041 
3042   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3043 
3044   // The return address must always be correct so that the frame constructor
3045   // never sees an invalid pc.
3046 
3047   if (!cause_return) {
3048     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3049     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3050     // if someone changed the return address for us!
3051     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3052     __ movptr(Address(rbp, wordSize), rbx);
3053   }
3054 
3055   // Do the call
3056   __ mov(c_rarg0, r15_thread);
3057   __ call(RuntimeAddress(call_ptr));
3058 
3059   // Set an oopmap for the call site.  This oopmap will map all
3060   // oop-registers and debug-info registers as callee-saved.  This
3061   // will allow deoptimization at this safepoint to find all possible
3062   // debug-info recordings, as well as let GC find all oops.
3063 
3064   oop_maps->add_gc_map( __ pc() - start, map);
3065 
3066   Label noException;
3067 
3068   __ reset_last_Java_frame(false);
3069 
3070   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3071   __ jcc(Assembler::equal, noException);
3072 
3073   // Exception pending
3074 
3075   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3076 
3077   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3078 
3079   // No exception case
3080   __ bind(noException);
3081 
3082   Label no_adjust;
3083 #ifdef ASSERT
3084   Label bail;
3085 #endif
3086   if (!cause_return) {
3087     Label no_prefix, not_special, check_rex_prefix;
3088 
3089     // If our stashed return pc was modified by the runtime we avoid touching it
3090     __ cmpptr(rbx, Address(rbp, wordSize));
3091     __ jcc(Assembler::notEqual, no_adjust);
3092 
3093     // Skip over the poll instruction.
3094     // See NativeInstruction::is_safepoint_poll()
3095     // Possible encodings:
3096     //      85 00       test   %eax,(%rax)
3097     //      85 01       test   %eax,(%rcx)
3098     //      85 02       test   %eax,(%rdx)
3099     //      85 03       test   %eax,(%rbx)
3100     //      85 06       test   %eax,(%rsi)
3101     //      85 07       test   %eax,(%rdi)
3102     //
3103     //   41 85 00       test   %eax,(%r8)
3104     //   41 85 01       test   %eax,(%r9)
3105     //   41 85 02       test   %eax,(%r10)
3106     //   41 85 03       test   %eax,(%r11)
3107     //   41 85 06       test   %eax,(%r14)
3108     //   41 85 07       test   %eax,(%r15)
3109     //
3110     //      85 04 24    test   %eax,(%rsp)
3111     //   41 85 04 24    test   %eax,(%r12)
3112     //      85 45 00    test   %eax,0x0(%rbp)
3113     //   41 85 45 00    test   %eax,0x0(%r13)
3114     //
3115     // Notes:
3116     //  Format of a legacy MAP0 test instruction:
3117     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3118     //  o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
3119     //     first register operand and of the base register of the memory operand lie in [0-8),
3120     //     so no additional REX prefix (whose REX.B bit would hold the MSB of the register
3121     //     encoding) is required, and a two-byte encoding is sufficient.
3122     //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the BASE
3123     //     register of the memory operand is 1000, so an additional REX prefix is needed,
3124     //     adding one byte to the instruction encoding.
3125     //  o  If the BASE register is one of the extended GPRs (r16-r31), available only on targets
3126     //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
3127     //     the most significant two bits of the 5-bit register encoding.
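         //
         //  Illustrative example: for "test %eax,0x0(%r13)", encoded as 41 85 45 00, the code
         //  below steps over the REX prefix (1 byte), adds one byte for the r13/rbp special
         //  base encoding, and finally skips the two-byte opcode+ModRM core, adjusting the
         //  return pc by 4 bytes in total.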
3128 
3129     if (VM_Version::supports_apx_f()) {
3130       __ cmpb(Address(rbx, 0), Assembler::REX2);
3131       __ jccb(Assembler::notEqual, check_rex_prefix);
3132       __ addptr(rbx, 2);
3133       __ bind(check_rex_prefix);
3134     }
3135     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3136     __ jccb(Assembler::notEqual, no_prefix);
3137     __ addptr(rbx, 1);
3138     __ bind(no_prefix);
3139 #ifdef ASSERT
3140     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3141 #endif
3142     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3143     // r12/rsp 0x04
3144     // r13/rbp 0x05
3145     __ movzbq(rcx, Address(rbx, 1));
3146     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3147     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3148     __ cmpptr(rcx, 1);
3149     __ jccb(Assembler::above, not_special);
3150     __ addptr(rbx, 1);
3151     __ bind(not_special);
3152 #ifdef ASSERT
3153     // Verify the correct encoding of the poll we're about to skip.
3154     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3155     __ jcc(Assembler::notEqual, bail);
3156     // Mask out the modrm bits
3157     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3158     // rax encodes to 0, so if the bits are nonzero it's incorrect
3159     __ jcc(Assembler::notZero, bail);
3160 #endif
3161     // Adjust return pc forward to step over the safepoint poll instruction
3162     __ addptr(rbx, 2);
3163     __ movptr(Address(rbp, wordSize), rbx);
3164   }
3165 
3166   __ bind(no_adjust);
3167   // Normal exit, restore registers and exit.
3168   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3169   __ ret(0);
3170 
3171 #ifdef ASSERT
3172   __ bind(bail);
3173   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3174 #endif
3175 
3176   // Make sure all code is generated
3177   masm->flush();
3178 
3179   // Fill-out other meta info
3180   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3181 }
3182 
3183 //
3184 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3185 //
3186 // Generate a stub that calls into the VM to find out the proper destination
3187 // of a Java call. All the argument registers are live at this point,
3188 // but since this is generic code we don't know what they are, and the caller
3189 // must do any gc of the args.
3190 //
3191 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3192   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3193   assert(is_resolve_id(id), "expected a resolve stub id");
3194 
3195   // allocate space for the code
3196   ResourceMark rm;
3197 
3198   const char* name = SharedRuntime::stub_name(id);
3199   CodeBuffer buffer(name, 1552, 512);
3200   MacroAssembler* masm = new MacroAssembler(&buffer);
3201 
3202   int frame_size_in_words;
3203 
3204   OopMapSet *oop_maps = new OopMapSet();
3205   OopMap* map = nullptr;
3206 
3207   int start = __ offset();
3208 
3209   // No need to save vector registers since they are caller-saved anyway.
3210   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3211 
3212   int frame_complete = __ offset();
3213 
3214   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3215 
3216   __ mov(c_rarg0, r15_thread);
3217 
3218   __ call(RuntimeAddress(destination));
3219 
3220 
3221   // Set an oopmap for the call site.
3222   // We need this not only for callee-saved registers, but also for volatile
3223   // registers that the compiler might be keeping live across a safepoint.
3224 
3225   oop_maps->add_gc_map( __ offset() - start, map);
3226 
3227   // rax contains the address we are going to jump to, assuming no exception got installed
3228 
3229   // clear last_Java_sp
3230   __ reset_last_Java_frame(false);
3231   // check for pending exceptions
3232   Label pending;
3233   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3234   __ jcc(Assembler::notEqual, pending);
3235 
3236   // get the returned Method*
3237   __ get_vm_result_2(rbx, r15_thread);
3238   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3239 
3240   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3241 
3242   RegisterSaver::restore_live_registers(masm);
3243 
3244   // We are back to the original state on entry and ready to go.
3245 
3246   __ jmp(rax);
3247 
3248   // Pending exception after the safepoint
3249 
3250   __ bind(pending);
3251 
3252   RegisterSaver::restore_live_registers(masm);
3253 
3254   // exception pending => remove activation and forward to exception handler
3255 
3256   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3257 
3258   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3259   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3260 
3261   // -------------
3262   // make sure all code is generated
3263   masm->flush();
3264 
3265   // return the blob
3266   // frame_size_words or bytes??
3267   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3268 }
3269 
3270 // Continuation point for throwing of implicit exceptions that are
3271 // not handled in the current activation. Fabricates an exception
3272 // oop and initiates normal exception dispatching in this
3273 // frame. Since we need to preserve callee-saved values (currently
3274 // only for C2, but done for C1 as well) we need a callee-saved oop
3275 // map and therefore have to make these stubs into RuntimeStubs
3276 // rather than BufferBlobs.  If the compiler needs all registers to
3277 // be preserved between the fault point and the exception handler
3278 // then it must assume responsibility for that in
3279 // AbstractCompiler::continuation_for_implicit_null_exception or
3280 // continuation_for_implicit_division_by_zero_exception. All other
3281 // implicit exceptions (e.g., NullPointerException or
3282 // AbstractMethodError on entry) are either at call sites or
3283 // otherwise assume that stack unwinding will be initiated, so
3284 // caller saved registers were assumed volatile in the compiler.
3285 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3286   assert(is_throw_id(id), "expected a throw stub id");
3287 
3288   const char* name = SharedRuntime::stub_name(id);
3289 
3290   // Information about frame layout at time of blocking runtime call.
3291   // Note that we only have to preserve callee-saved registers since
3292   // the compilers are responsible for supplying a continuation point
3293   // if they expect all registers to be preserved.
3294   enum layout {
3295     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3296     rbp_off2,
3297     return_off,
3298     return_off2,
3299     framesize // inclusive of return address
3300   };
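       // As an illustration, assuming frame::arg_reg_save_area_bytes is 32 on
       // Windows (the register-argument home area) and 0 elsewhere, framesize is
       // 12 or 4 32-bit slots respectively, i.e. a 48-byte or 16-byte frame.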
3301 
3302   int insts_size = 512;
3303   int locs_size  = 64;
3304 
3305   ResourceMark rm;
3306   const char* timer_msg = "SharedRuntime generate_throw_exception";
3307   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3308 
3309   CodeBuffer code(name, insts_size, locs_size);
3310   OopMapSet* oop_maps  = new OopMapSet();
3311   MacroAssembler* masm = new MacroAssembler(&code);
3312 
3313   address start = __ pc();
3314 
3315   // This is an inlined and slightly modified version of call_VM
3316   // which has the ability to fetch the return PC out of
3317   // thread-local storage and also sets up last_Java_sp slightly
3318   // differently than the real call_VM.
3319 
3320   __ enter(); // required for proper stackwalking of RuntimeStub frame
3321 
3322   assert(is_even(framesize/2), "sp not 16-byte aligned");
3323 
3324   // return address and rbp are already in place
3325   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3326 
3327   int frame_complete = __ pc() - start;
3328 
3329   // Set up last_Java_sp and last_Java_fp
3330   address the_pc = __ pc();
3331   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3332   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3333 
3334   // Call runtime
3335   __ movptr(c_rarg0, r15_thread);
3336   BLOCK_COMMENT("call runtime_entry");
3337   __ call(RuntimeAddress(runtime_entry));
3338 
3339   // Generate oop map
3340   OopMap* map = new OopMap(framesize, 0);
3341 
3342   oop_maps->add_gc_map(the_pc - start, map);
3343 
3344   __ reset_last_Java_frame(true);
3345 
3346   __ leave(); // required for proper stackwalking of RuntimeStub frame
3347 
3348   // check for pending exceptions
3349 #ifdef ASSERT
3350   Label L;
3351   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3352   __ jcc(Assembler::notEqual, L);
3353   __ should_not_reach_here();
3354   __ bind(L);
3355 #endif // ASSERT
3356   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3357 
3358 
3359   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3360   RuntimeStub* stub =
3361     RuntimeStub::new_runtime_stub(name,
3362                                   &code,
3363                                   frame_complete,
3364                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3365                                   oop_maps, false);
3366   return stub;
3367 }
3368 
3369 //------------------------------Montgomery multiplication------------------------
3370 //
3371 
3372 #ifndef _WINDOWS
3373 
3374 // Subtract 0:b from carry:a.  Return carry.
3375 static julong
3376 sub(julong a[], julong b[], julong carry, long len) {
3377   long long i = 0, cnt = len;
3378   julong tmp;
3379   asm volatile("clc; "
3380                "0: ; "
3381                "mov (%[b], %[i], 8), %[tmp]; "
3382                "sbb %[tmp], (%[a], %[i], 8); "
3383                "inc %[i]; dec %[cnt]; "
3384                "jne 0b; "
3385                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3386                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3387                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3388                : "memory");
3389   return tmp;
3390 }
3391 
3392 // Multiply (unsigned) Long A by Long B, accumulating the double-
3393 // length result into the accumulator formed of T0, T1, and T2.
3394 #define MACC(A, B, T0, T1, T2)                                  \
3395 do {                                                            \
3396   unsigned long hi, lo;                                         \
3397   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3398            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3399            : "r"(A), "a"(B) : "cc");                            \
3400  } while(0)
3401 
3402 // As above, but add twice the double-length result into the
3403 // accumulator.
3404 #define MACC2(A, B, T0, T1, T2)                                 \
3405 do {                                                            \
3406   unsigned long hi, lo;                                         \
3407   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3408            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3409            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3410            : "r"(A), "a"(B) : "cc");                            \
3411  } while(0)
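     // Reading (T2:T1:T0) as a little-endian triple-word accumulator, MACC
     // performs (T2:T1:T0) += A * B and MACC2 performs (T2:T1:T0) += 2 * A * B.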
3412 
3413 #else //_WINDOWS
3414 
3415 static julong
3416 sub(julong a[], julong b[], julong carry, long len) {
3417   long i;
3418   julong tmp;
3419   unsigned char c = 1;
3420   for (i = 0; i < len; i++) {
3421     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3422     a[i] = tmp;
3423   }
3424   c = _addcarry_u64(c, carry, ~0, &tmp);
3425   return tmp;
3426 }
3427 
3428 // Multiply (unsigned) Long A by Long B, accumulating the double-
3429 // length result into the accumulator formed of T0, T1, and T2.
3430 #define MACC(A, B, T0, T1, T2)                          \
3431 do {                                                    \
3432   julong hi, lo;                            \
3433   lo = _umul128(A, B, &hi);                             \
3434   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3435   c = _addcarry_u64(c, hi, T1, &T1);                    \
3436   _addcarry_u64(c, T2, 0, &T2);                         \
3437  } while(0)
3438 
3439 // As above, but add twice the double-length result into the
3440 // accumulator.
3441 #define MACC2(A, B, T0, T1, T2)                         \
3442 do {                                                    \
3443   julong hi, lo;                            \
3444   lo = _umul128(A, B, &hi);                             \
3445   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3446   c = _addcarry_u64(c, hi, T1, &T1);                    \
3447   _addcarry_u64(c, T2, 0, &T2);                         \
3448   c = _addcarry_u64(0, lo, T0, &T0);                    \
3449   c = _addcarry_u64(c, hi, T1, &T1);                    \
3450   _addcarry_u64(c, T2, 0, &T2);                         \
3451  } while(0)
3452 
3453 #endif //_WINDOWS
3454 
3455 // Fast Montgomery multiplication.  The derivation of the algorithm is
3456 // in  A Cryptographic Library for the Motorola DSP56000,
3457 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
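     //
     // In outline: inv is -n[0]^-1 mod 2^64, and the routine computes m with
     // m == a * b * R^-1 (mod n), where R = 2^(64*len). In each iteration the
     // low accumulator word t0 is cancelled by adding (t0 * inv mod 2^64) * n,
     // so one 64-bit word can be shifted out of the accumulator.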
3458 
3459 static void NOINLINE
3460 montgomery_multiply(julong a[], julong b[], julong n[],
3461                     julong m[], julong inv, int len) {
3462   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3463   int i;
3464 
3465   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3466 
3467   for (i = 0; i < len; i++) {
3468     int j;
3469     for (j = 0; j < i; j++) {
3470       MACC(a[j], b[i-j], t0, t1, t2);
3471       MACC(m[j], n[i-j], t0, t1, t2);
3472     }
3473     MACC(a[i], b[0], t0, t1, t2);
3474     m[i] = t0 * inv;
3475     MACC(m[i], n[0], t0, t1, t2);
3476 
3477     assert(t0 == 0, "broken Montgomery multiply");
3478 
3479     t0 = t1; t1 = t2; t2 = 0;
3480   }
3481 
3482   for (i = len; i < 2*len; i++) {
3483     int j;
3484     for (j = i-len+1; j < len; j++) {
3485       MACC(a[j], b[i-j], t0, t1, t2);
3486       MACC(m[j], n[i-j], t0, t1, t2);
3487     }
3488     m[i-len] = t0;
3489     t0 = t1; t1 = t2; t2 = 0;
3490   }
3491 
3492   while (t0)
3493     t0 = sub(m, n, t0, len);
3494 }
3495 
3496 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3497 // multiplies so it should be up to 25% faster than Montgomery
3498 // multiplication.  However, its loop control is more complex and it
3499 // may actually run slower on some machines.
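     //
     // The inner loops exploit the symmetry a[j]*a[i-j] == a[i-j]*a[j]: each
     // off-diagonal product is accumulated once, doubled, via MACC2, while the
     // diagonal square a[j]*a[j] is added once (only when i is even).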
3500 
3501 static void NOINLINE
3502 montgomery_square(julong a[], julong n[],
3503                   julong m[], julong inv, int len) {
3504   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3505   int i;
3506 
3507   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3508 
3509   for (i = 0; i < len; i++) {
3510     int j;
3511     int end = (i+1)/2;
3512     for (j = 0; j < end; j++) {
3513       MACC2(a[j], a[i-j], t0, t1, t2);
3514       MACC(m[j], n[i-j], t0, t1, t2);
3515     }
3516     if ((i & 1) == 0) {
3517       MACC(a[j], a[j], t0, t1, t2);
3518     }
3519     for (; j < i; j++) {
3520       MACC(m[j], n[i-j], t0, t1, t2);
3521     }
3522     m[i] = t0 * inv;
3523     MACC(m[i], n[0], t0, t1, t2);
3524 
3525     assert(t0 == 0, "broken Montgomery square");
3526 
3527     t0 = t1; t1 = t2; t2 = 0;
3528   }
3529 
3530   for (i = len; i < 2*len; i++) {
3531     int start = i-len+1;
3532     int end = start + (len - start)/2;
3533     int j;
3534     for (j = start; j < end; j++) {
3535       MACC2(a[j], a[i-j], t0, t1, t2);
3536       MACC(m[j], n[i-j], t0, t1, t2);
3537     }
3538     if ((i & 1) == 0) {
3539       MACC(a[j], a[j], t0, t1, t2);
3540     }
3541     for (; j < len; j++) {
3542       MACC(m[j], n[i-j], t0, t1, t2);
3543     }
3544     m[i-len] = t0;
3545     t0 = t1; t1 = t2; t2 = 0;
3546   }
3547 
3548   while (t0)
3549     t0 = sub(m, n, t0, len);
3550 }
3551 
3552 // Swap words in a longword.
3553 static julong swap(julong x) {
3554   return (x << 32) | (x >> 32);
3555 }
3556 
3557 // Copy len longwords from s to d, word-swapping as we go.  The
3558 // destination array is reversed.
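     // For example, with len == 2 and s = { 0x0000000100000002, 0x0000000300000004 },
     // the result is d = { 0x0000000400000003, 0x0000000200000001 }: each longword is
     // word-swapped and the array order is reversed.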
3559 static void reverse_words(julong *s, julong *d, int len) {
3560   d += len;
3561   while(len-- > 0) {
3562     d--;
3563     *d = swap(*s);
3564     s++;
3565   }
3566 }
3567 
3568 // The threshold at which squaring is advantageous was determined
3569 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3570 #define MONTGOMERY_SQUARING_THRESHOLD 64
3571 
3572 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3573                                         jint len, jlong inv,
3574                                         jint *m_ints) {
3575   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3576   int longwords = len/2;
3577 
3578   // Make very sure we don't use so much space that the stack might
3579   // overflow.  512 jints corresponds to a 16384-bit integer and
3580   // will use here a total of 8k bytes of stack space.
3581   int divisor = sizeof(julong) * 4;
3582   guarantee(longwords <= 8192 / divisor, "must be");
3583   int total_allocation = longwords * sizeof (julong) * 4;
3584   julong *scratch = (julong *)alloca(total_allocation);
3585 
3586   // Local scratch arrays
3587   julong
3588     *a = scratch + 0 * longwords,
3589     *b = scratch + 1 * longwords,
3590     *n = scratch + 2 * longwords,
3591     *m = scratch + 3 * longwords;
3592 
3593   reverse_words((julong *)a_ints, a, longwords);
3594   reverse_words((julong *)b_ints, b, longwords);
3595   reverse_words((julong *)n_ints, n, longwords);
3596 
3597   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3598 
3599   reverse_words(m, (julong *)m_ints, longwords);
3600 }
3601 
3602 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3603                                       jint len, jlong inv,
3604                                       jint *m_ints) {
3605   assert(len % 2 == 0, "array length in montgomery_square must be even");
3606   int longwords = len/2;
3607 
3608   // Make very sure we don't use so much space that the stack might
3609   // overflow.  512 jints corresponds to a 16384-bit integer and
3610   // will use here a total of 6k bytes of stack space.
3611   int divisor = sizeof(julong) * 3;
3612   guarantee(longwords <= (8192 / divisor), "must be");
3613   int total_allocation = longwords * sizeof (julong) * 3;
3614   julong *scratch = (julong *)alloca(total_allocation);
3615 
3616   // Local scratch arrays
3617   julong
3618     *a = scratch + 0 * longwords,
3619     *n = scratch + 1 * longwords,
3620     *m = scratch + 2 * longwords;
3621 
3622   reverse_words((julong *)a_ints, a, longwords);
3623   reverse_words((julong *)n_ints, n, longwords);
3624 
3625   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3626     ::montgomery_square(a, n, m, (julong)inv, longwords);
3627   } else {
3628     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3629   }
3630 
3631   reverse_words(m, (julong *)m_ints, longwords);
3632 }
3633 
3634 #if INCLUDE_JFR
3635 
3636 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3637 // It returns a jobject handle to the event writer.
3638 // The handle is dereferenced and the return value is the event writer oop.
3639 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3640   enum layout {
3641     rbp_off,
3642     rbpH_off,
3643     return_off,
3644     return_off2,
3645     framesize // inclusive of return address
3646   };
3647 
3648   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3649   CodeBuffer code(name, 1024, 64);
3650   MacroAssembler* masm = new MacroAssembler(&code);
3651   address start = __ pc();
3652 
3653   __ enter();
3654   address the_pc = __ pc();
3655 
3656   int frame_complete = the_pc - start;
3657 
3658   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3659   __ movptr(c_rarg0, r15_thread);
3660   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3661   __ reset_last_Java_frame(true);
3662 
3663   // rax is jobject handle result, unpack and process it through a barrier.
3664   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
3665 
3666   __ leave();
3667   __ ret(0);
3668 
3669   OopMapSet* oop_maps = new OopMapSet();
3670   OopMap* map = new OopMap(framesize, 1);
3671   oop_maps->add_gc_map(frame_complete, map);
3672 
3673   RuntimeStub* stub =
3674     RuntimeStub::new_runtime_stub(name,
3675                                   &code,
3676                                   frame_complete,
3677                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3678                                   oop_maps,
3679                                   false);
3680   return stub;
3681 }
3682 
3683 // For c2: call to return a leased buffer.
3684 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3685   enum layout {
3686     rbp_off,
3687     rbpH_off,
3688     return_off,
3689     return_off2,
3690     framesize // inclusive of return address
3691   };
3692 
3693   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3694   CodeBuffer code(name, 1024, 64);
3695   MacroAssembler* masm = new MacroAssembler(&code);
3696   address start = __ pc();
3697 
3698   __ enter();
3699   address the_pc = __ pc();
3700 
3701   int frame_complete = the_pc - start;
3702 
3703   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3704   __ movptr(c_rarg0, r15_thread);
3705   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3706   __ reset_last_Java_frame(true);
3707 
3708   __ leave();
3709   __ ret(0);
3710 
3711   OopMapSet* oop_maps = new OopMapSet();
3712   OopMap* map = new OopMap(framesize, 1);
3713   oop_maps->add_gc_map(frame_complete, map);
3714 
3715   RuntimeStub* stub =
3716     RuntimeStub::new_runtime_stub(name,
3717                                   &code,
3718                                   frame_complete,
3719                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3720                                   oop_maps,
3721                                   false);
3722   return stub;
3723 }
3724 
3725 #endif // INCLUDE_JFR
3726