1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/nativeInst.hpp"
  33 #include "code/vtableStubs.hpp"
  34 #include "compiler/oopMap.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "gc/shared/gcLocker.hpp"
  37 #include "gc/shared/barrierSet.hpp"
  38 #include "gc/shared/barrierSetAssembler.hpp"
  39 #include "interpreter/interpreter.hpp"
  40 #include "logging/log.hpp"
  41 #include "memory/resourceArea.hpp"
  42 #include "memory/universe.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/method.inline.hpp"
  45 #include "prims/methodHandles.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/globals.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/timerTrace.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 #ifdef PRODUCT
  74 #define BLOCK_COMMENT(str) /* nothing */
  75 #else
  76 #define BLOCK_COMMENT(str) __ block_comment(str)
  77 #endif // PRODUCT
  78 
  79 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  80 
  81 class RegisterSaver {
  82   // Capture info about frame layout.  Layout offsets are in jint
  83   // units because compiler frame slots are jints.
  84 #define XSAVE_AREA_BEGIN 160
  85 #define XSAVE_AREA_YMM_BEGIN 576
  86 #define XSAVE_AREA_EGPRS 960
  87 #define XSAVE_AREA_OPMASK_BEGIN 1088
  88 #define XSAVE_AREA_ZMM_BEGIN 1152
  89 #define XSAVE_AREA_UPPERBANK 1664
  90 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  91 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  92 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  93 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  94 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
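  // For illustration only (not part of the layout definition): a rough sketch of how
  // these helper macros expand, assuming BytesPerInt == 4.  DEF_XMM_OFFS(0) yields
  //   xmm0_off  = xmm_off + 0*16/BytesPerInt,   // == xmm_off, first jint slot of xmm0
  //   xmm0H_off,                                // == xmm_off + 1, its "high" companion slot
  // and DEF_ZMM_UPPER_OFFS(16) yields
  //   zmm16_off = zmm_upper_off + (16-16)*64/BytesPerInt,   // == zmm_upper_off
  //   zmm16H_off,
  // so each register is described by a pair of adjacent jint-sized slot indices below.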
  95   enum layout {
  96     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  97     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  98     DEF_XMM_OFFS(0),
  99     DEF_XMM_OFFS(1),
 100     // 2..15 are implied in range usage
 101     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 102     DEF_YMM_OFFS(0),
 103     DEF_YMM_OFFS(1),
 104     // 2..15 are implied in range usage
 105     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r31H_off,
 107     r30_off, r30H_off,
 108     r29_off, r29H_off,
 109     r28_off, r28H_off,
 110     r27_off, r27H_off,
 111     r26_off, r26H_off,
 112     r25_off, r25H_off,
 113     r24_off, r24H_off,
 114     r23_off, r23H_off,
 115     r22_off, r22H_off,
 116     r21_off, r21H_off,
 117     r20_off, r20H_off,
 118     r19_off, r19H_off,
 119     r18_off, r18H_off,
 120     r17_off, r17H_off,
 121     r16_off, r16H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
 187     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
 194   // Always make the frame size 16-byte aligned; both vector and non-vector frames are allocated that way
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
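  // Worked example of the sizing above (illustrative): each of the reg_save_size
  // slots is BytesPerInt == 4 bytes, frame_size_in_slots divides the aligned byte
  // size back into 4-byte OopMap slots, and frame_size_in_words divides it into
  // 8-byte machine words, so frame_size_in_slots == 2 * frame_size_in_words here.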
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
 205   // We push rbp twice in this sequence because we want the real rbp
 206   // to be under the return address, like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
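  // Alignment bookkeeping (illustrative): on entry rsp is 8 mod 16 because the caller
  // pushed the 8-byte return address; enter() pushes rbp (back to 16-byte alignment),
  // pushf() pushes an 8-byte flags word, and the subq above spends another 8 bytes so
  // the FPU/vector save area pushed next starts out 16-byte aligned as well.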
 212   // Push CPU state in multiple of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
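  // Rough picture of the save-area offsets used above, relative to rsp at this point
  // (after push_FPU_state, before the argument register save area is allocated); the
  // values are simply the XSAVE_AREA_* constants defined earlier:
  //   rsp +  160  legacy FXSAVE image of xmm0..xmm15 (low 128 bits)   XSAVE_AREA_BEGIN
  //   rsp +  576  upper halves of ymm0..ymm15                         XSAVE_AREA_YMM_BEGIN
  //   rsp +  960  APX extended GPRs r16..r31                          XSAVE_AREA_EGPRS
  //   rsp + 1088  opmask registers k0..k7                             XSAVE_AREA_OPMASK_BEGIN
  //   rsp + 1152  upper halves of zmm0..zmm15                         XSAVE_AREA_ZMM_BEGIN
  //   rsp + 1664  full zmm16..zmm31                                   XSAVE_AREA_UPPERBANK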
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for(int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
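  // For example (illustrative): STACK_OFFSET(rax_off) is just VMRegImpl::stack2reg(rax_off),
  // i.e. the VMReg naming the rax_off'th 4-byte stack slot of this frame, which is where
  // the register-saving code above left the value of rax.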
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
 295   // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 326   // on EVEX-enabled targets it is included in the XSAVE area as well
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
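  // The off/delta stride mirrors the enum spacing (illustrative): with BytesPerInt == 4,
  // xmm1_off - xmm0_off == 16/4 == 4 slots, so consecutive xmm entries are 16 bytes apart
  // in the FXSAVE image, and zmm17_off - zmm16_off == 64/4 == 16 slots (64 bytes) apart in
  // the upper-bank area.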
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 404     // on EVEX-enabled targets it is included in the XSAVE area as well
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
 513   // Just restore result registers. Only used by deoptimization. By
 514   // now any callee-save register that needs to be restored to a c2
 515   // caller of the deoptee has been extracted into the vframeArray
 516   // and will be stuffed into the c2i adapter we create for later
 517   // restoration, so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
 525   // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
 529 // Is the vector's size (in bytes) bigger than the size saved by default?
 530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
 541 // VMRegImpl::stack0 refers to the first slot 0(sp),
 542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 543 // Registers up to Register::number_of_registers are the 64-bit
 544 // integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course for i486 there is no 64-bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
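// A small worked example of the shifted convention (illustrative, not generated code):
// for a Java signature taking (int, long, Object, float, double) the loop below assigns
//   int    -> j_rarg0                         (set1, one 32-bit slot)
//   long   -> j_rarg1, its T_VOID half is set_bad()
//   Object -> j_rarg2                         (set2, full 64-bit register)
//   float  -> j_farg0
//   double -> j_farg1, its T_VOID half is set_bad()
// and returns stk_args == 0 since nothing spilled to the stack.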
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
 637 // Patch the caller's callsite with entry to compiled code if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 
 680 static void gen_c2i_adapter(MacroAssembler *masm,
 681                             int total_args_passed,
 682                             int comp_args_on_stack,
 683                             const BasicType *sig_bt,
 684                             const VMRegPair *regs,
 685                             Label& skip_fixup) {
 686   // Before we get into the guts of the C2I adapter, see if we should be here
 687   // at all.  We've come from compiled code and are attempting to jump to the
 688   // interpreter, which means the caller made a static call to get here
 689   // (vcalls always get a compiled target if there is one).  Check for a
 690   // compiled target.  If there is one, we need to patch the caller's call.
 691   patch_callers_callsite(masm);
 692 
 693   __ bind(skip_fixup);
 694 
 695   // Since all args are passed on the stack, total_args_passed *
 696   // Interpreter::stackElementSize is the space we need.
 697 
 698   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 699 
 700   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 701 
 702   // stack is aligned, keep it that way
 703   // This is not currently needed or enforced by the interpreter, but
 704   // we might as well conform to the ABI.
 705   extraspace = align_up(extraspace, 2*wordSize);
 706 
 707   // set senderSP value
 708   __ lea(r13, Address(rsp, wordSize));
 709 
 710 #ifdef ASSERT
 711   __ check_stack_alignment(r13, "sender stack not aligned");
 712 #endif
 713   if (extraspace > 0) {
 714     // Pop the return address
 715     __ pop(rax);
 716 
 717     __ subptr(rsp, extraspace);
 718 
 719     // Push the return address
 720     __ push(rax);
 721 
 722     // Account for the return address location since we store it first rather
 723     // than hold it in a register across all the shuffling
 724     extraspace += wordSize;
 725   }
 726 
 727 #ifdef ASSERT
 728   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 729 #endif
 730 
 731   // Now write the args into the outgoing interpreter space
 732   for (int i = 0; i < total_args_passed; i++) {
 733     if (sig_bt[i] == T_VOID) {
 734       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 735       continue;
 736     }
 737 
 738     // offset to start parameters
 739     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 740     int next_off = st_off - Interpreter::stackElementSize;
 741 
 742     // Say 4 args:
 743     // i   st_off
 744     // 0   32 T_LONG
 745     // 1   24 T_VOID
 746     // 2   16 T_OBJECT
 747     // 3    8 T_BOOL
 748     // -    0 return address
 749     //
 750     // However, to make things extra confusing: because we can fit a long/double in
 751     // a single slot on a 64-bit VM and it would be silly to break them up, the
 752     // interpreter leaves one slot empty and only stores to a single slot. In this case
 753     // the slot that is occupied is the T_VOID slot. See, I said it was confusing.
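    // Concretely, for the 4-arg example above (illustrative): the T_LONG at i == 0 has
    // st_off == 32 and next_off == 24; its 64-bit value is stored once at next_off (the
    // slot labeled T_VOID in the table), while in debug builds st_off is overwritten
    // with the 0xdeadffff... junk pattern instead of holding a second copy.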
 754 
 755     VMReg r_1 = regs[i].first();
 756     VMReg r_2 = regs[i].second();
 757     if (!r_1->is_valid()) {
 758       assert(!r_2->is_valid(), "");
 759       continue;
 760     }
 761     if (r_1->is_stack()) {
 762       // memory to memory use rax
 763       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 764       if (!r_2->is_valid()) {
 765         // sign extend??
 766         __ movl(rax, Address(rsp, ld_off));
 767         __ movptr(Address(rsp, st_off), rax);
 768 
 769       } else {
 770 
 771         __ movq(rax, Address(rsp, ld_off));
 772 
 773         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 774         // T_DOUBLE and T_LONG use two slots in the interpreter
 775         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 776           // ld_off == LSW, ld_off+wordSize == MSW
 777           // st_off == MSW, next_off == LSW
 778           __ movq(Address(rsp, next_off), rax);
 779 #ifdef ASSERT
 780           // Overwrite the unused slot with known junk
 781           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 782           __ movptr(Address(rsp, st_off), rax);
 783 #endif /* ASSERT */
 784         } else {
 785           __ movq(Address(rsp, st_off), rax);
 786         }
 787       }
 788     } else if (r_1->is_Register()) {
 789       Register r = r_1->as_Register();
 790       if (!r_2->is_valid()) {
 791         // must be only an int (or less) so move only 32 bits to the slot
 792         // why not sign extend??
 793         __ movl(Address(rsp, st_off), r);
 794       } else {
 795         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 796         // T_DOUBLE and T_LONG use two slots in the interpreter
 797         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 798           // long/double in gpr
 799 #ifdef ASSERT
 800           // Overwrite the unused slot with known junk
 801           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 802           __ movptr(Address(rsp, st_off), rax);
 803 #endif /* ASSERT */
 804           __ movq(Address(rsp, next_off), r);
 805         } else {
 806           __ movptr(Address(rsp, st_off), r);
 807         }
 808       }
 809     } else {
 810       assert(r_1->is_XMMRegister(), "");
 811       if (!r_2->is_valid()) {
 812         // only a float; use just part of the slot
 813         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 814       } else {
 815 #ifdef ASSERT
 816         // Overwrite the unused slot with known junk
 817         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 818         __ movptr(Address(rsp, st_off), rax);
 819 #endif /* ASSERT */
 820         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 821       }
 822     }
 823   }
 824 
 825   // Schedule the branch target address early.
 826   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 827   __ jmp(rcx);
 828 }
 829 
 830 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 831                         address code_start, address code_end,
 832                         Label& L_ok) {
 833   Label L_fail;
 834   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
 835   __ cmpptr(pc_reg, temp_reg);
 836   __ jcc(Assembler::belowEqual, L_fail);
 837   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
 838   __ cmpptr(pc_reg, temp_reg);
 839   __ jcc(Assembler::below, L_ok);
 840   __ bind(L_fail);
 841 }
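// Note on the checks above: control reaches L_ok only when code_start < pc < code_end.
// A pc at or below code_start jumps to L_fail, and a pc at or above code_end falls
// through to the L_fail bind, which in turn falls through to whatever the caller
// emits right after the range_check call.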
 842 
 843 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 844                                     int total_args_passed,
 845                                     int comp_args_on_stack,
 846                                     const BasicType *sig_bt,
 847                                     const VMRegPair *regs) {
 848 
 849   // Note: r13 contains the senderSP on entry. We must preserve it since
 850   // we may do an i2c -> c2i transition if we lose a race where compiled
 851   // code goes non-entrant while we get args ready.
 852   // In addition we use r13 to locate all the interpreter args because
 853   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 854   // lose the alignment we expect in all compiled code, and the register
 855   // save code can segv when fxsave instructions find an improperly
 856   // aligned stack pointer.
 857 
 858   // Adapters can be frameless because they do not require the caller
 859   // to perform additional cleanup work, such as correcting the stack pointer.
 860   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 861   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 862   // even if a callee has modified the stack pointer.
 863   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 864   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 865   // up via the senderSP register).
 866   // In other words, if *either* the caller or callee is interpreted, we can
 867   // get the stack pointer repaired after a call.
 868   // This is why c2i and i2c adapters cannot be indefinitely composed.
 869   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 870   // both caller and callee would be compiled methods, and neither would
 871   // clean up the stack pointer changes performed by the two adapters.
 872   // If this happens, control eventually transfers back to the compiled
 873   // caller, but with an uncorrected stack, causing delayed havoc.
 874 
 875   if (VerifyAdapterCalls &&
 876       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 877     // So, let's test for cascading c2i/i2c adapters right now.
 878     //  assert(Interpreter::contains($return_addr) ||
 879     //         StubRoutines::contains($return_addr),
 880     //         "i2c adapter must return to an interpreter frame");
 881     __ block_comment("verify_i2c { ");
 882     // Pick up the return address
 883     __ movptr(rax, Address(rsp, 0));
 884     Label L_ok;
 885     if (Interpreter::code() != nullptr) {
 886       range_check(masm, rax, r11,
 887                   Interpreter::code()->code_start(),
 888                   Interpreter::code()->code_end(),
 889                   L_ok);
 890     }
 891     if (StubRoutines::initial_stubs_code() != nullptr) {
 892       range_check(masm, rax, r11,
 893                   StubRoutines::initial_stubs_code()->code_begin(),
 894                   StubRoutines::initial_stubs_code()->code_end(),
 895                   L_ok);
 896     }
 897     if (StubRoutines::final_stubs_code() != nullptr) {
 898       range_check(masm, rax, r11,
 899                   StubRoutines::final_stubs_code()->code_begin(),
 900                   StubRoutines::final_stubs_code()->code_end(),
 901                   L_ok);
 902     }
 903     const char* msg = "i2c adapter must return to an interpreter frame";
 904     __ block_comment(msg);
 905     __ stop(msg);
 906     __ bind(L_ok);
 907     __ block_comment("} verify_i2ce ");
 908   }
 909 
 910   // Must preserve original SP for loading incoming arguments because
 911   // we need to align the outgoing SP for compiled code.
 912   __ movptr(r11, rsp);
 913 
 914   // Pick up the return address
 915   __ pop(rax);
 916 
 917   // Convert 4-byte c2 stack slots to words.
 918   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
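  // For example (illustrative): comp_args_on_stack == 3 slots is 12 bytes of outgoing
  // compiled args, which align_up rounds to 16 bytes, i.e. comp_words_on_stack == 2
  // eight-byte words reserved below.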
 919 
 920   if (comp_args_on_stack) {
 921     __ subptr(rsp, comp_words_on_stack * wordSize);
 922   }
 923 
 924   // Ensure compiled code always sees stack at proper alignment
 925   __ andptr(rsp, -16);
 926 
 927   // Push the return address and misalign the stack so that the youngest frame always sees
 928   // the stack as it would appear right after the placement of a call instruction
 929   __ push(rax);
 930 
 931   // Put saved SP in another register
 932   const Register saved_sp = rax;
 933   __ movptr(saved_sp, r11);
 934 
 935   // Will jump to the compiled code just as if compiled code was doing it.
 936   // Pre-load the register-jump target early, to schedule it better.
 937   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 938 
 939 #if INCLUDE_JVMCI
 940   if (EnableJVMCI) {
 941     // check if this call should be routed towards a specific entry point
 942     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 943     Label no_alternative_target;
 944     __ jcc(Assembler::equal, no_alternative_target);
 945     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 946     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 947     __ bind(no_alternative_target);
 948   }
 949 #endif // INCLUDE_JVMCI
 950 
 951   // Now generate the shuffle code.  Pick up all register args and move the
 952   // rest through the floating point stack top.
 953   for (int i = 0; i < total_args_passed; i++) {
 954     if (sig_bt[i] == T_VOID) {
 955       // Longs and doubles are passed in native word order, but misaligned
 956       // in the 32-bit build.
 957       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 958       continue;
 959     }
 960 
 961     // Pick up 0, 1 or 2 words from SP+offset.
 962 
 963     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 964             "scrambled load targets?");
 965     // Load in argument order going down.
 966     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 967     // Point to interpreter value (vs. tag)
 968     int next_off = ld_off - Interpreter::stackElementSize;
 969     //
 970     //
 971     //
 972     VMReg r_1 = regs[i].first();
 973     VMReg r_2 = regs[i].second();
 974     if (!r_1->is_valid()) {
 975       assert(!r_2->is_valid(), "");
 976       continue;
 977     }
 978     if (r_1->is_stack()) {
 979       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 980       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 981 
 982       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 983       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 984       // will be generated.
 985       if (!r_2->is_valid()) {
 986         // sign extend???
 987         __ movl(r13, Address(saved_sp, ld_off));
 988         __ movptr(Address(rsp, st_off), r13);
 989       } else {
 990         //
 991         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
 992         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 993         // So we must adjust where to pick up the data to match the interpreter.
 994         //
 995         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 996         // are accessed at negative offsets, so the LSW is at the lower address
 997 
 998         // ld_off is MSW so get LSW
 999         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1000                            next_off : ld_off;
1001         __ movq(r13, Address(saved_sp, offset));
1002         // st_off is LSW (i.e. reg.first())
1003         __ movq(Address(rsp, st_off), r13);
1004       }
1005     } else if (r_1->is_Register()) {  // Register argument
1006       Register r = r_1->as_Register();
1007       assert(r != rax, "must be different");
1008       if (r_2->is_valid()) {
1009         //
1010         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1011         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1012         // So we must adjust where to pick up the data to match the interpreter.
1013 
1014         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1015                            next_off : ld_off;
1016 
1017         // this can be a misaligned move
1018         __ movq(r, Address(saved_sp, offset));
1019       } else {
1020         // sign extend and use a full word?
1021         __ movl(r, Address(saved_sp, ld_off));
1022       }
1023     } else {
1024       if (!r_2->is_valid()) {
1025         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1026       } else {
1027         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1028       }
1029     }
1030   }
1031 
1032   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1033 
1034   // 6243940 We might end up in handle_wrong_method if
1035   // the callee is deoptimized as we race thru here. If that
1036   // happens we don't want to take a safepoint because the
1037   // caller frame will look interpreted and arguments are now
1038   // "compiled" so it is much better to make this transition
1039   // invisible to the stack walking code. Unfortunately if
1040   // we try and find the callee by normal means a safepoint
1041   // is possible. So we stash the desired callee in the thread
1042   // and the VM will find it there should this case occur.
1043 
1044   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1045 
1046   // Put Method* where a c2i would expect it should we end up there.
1047   // This is only needed because c2 resolve stubs return Method* as a result in
1048   // rax
1049   __ mov(rax, rbx);
1050   __ jmp(r11);
1051 }
1052 
1053 // ---------------------------------------------------------------
1054 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1055                                                             int total_args_passed,
1056                                                             int comp_args_on_stack,
1057                                                             const BasicType *sig_bt,
1058                                                             const VMRegPair *regs,
1059                                                             AdapterFingerPrint* fingerprint) {
1060   address i2c_entry = __ pc();
1061 
1062   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1063 
1064   // -------------------------------------------------------------------------
1065   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1066   // to the interpreter.  The args start out packed in the compiled layout.  They
1067   // need to be unpacked into the interpreter layout.  This will almost always
1068   // require some stack space.  We grow the current (compiled) stack, then repack
1069   // the args.  We  finally end in a jump to the generic interpreter entry point.
1070   // On exit from the interpreter, the interpreter will restore our SP (lest the
1071   // compiled code, which relies solely on SP and not RBP, get sick).
1072 
1073   address c2i_unverified_entry = __ pc();
1074   Label skip_fixup;
1075 
1076   Register data = rax;
1077   Register receiver = j_rarg0;
1078   Register temp = rbx;
1079 
1080   {
1081     __ ic_check(1 /* end_alignment */);
1082     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1083     // Method might have been compiled since the call site was patched to
1084     // interpreted; if that is the case treat it as a miss so we can get
1085     // the call site corrected.
1086     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1087     __ jcc(Assembler::equal, skip_fixup);
1088     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1089   }
1090 
1091   address c2i_entry = __ pc();
1092 
1093   // Class initialization barrier for static methods
1094   address c2i_no_clinit_check_entry = nullptr;
1095   if (VM_Version::supports_fast_class_init_checks()) {
1096     Label L_skip_barrier;
1097     Register method = rbx;
1098 
1099     { // Bypass the barrier for non-static methods
1100       Register flags = rscratch1;
1101       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1102       __ testl(flags, JVM_ACC_STATIC);
1103       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1104     }
1105 
1106     Register klass = rscratch1;
1107     __ load_method_holder(klass, method);
1108     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1109 
1110     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1111 
1112     __ bind(L_skip_barrier);
1113     c2i_no_clinit_check_entry = __ pc();
1114   }
1115 
1116   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1117   bs->c2i_entry_barrier(masm);
1118 
1119   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1120 
1121   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1122 }
1123 
1124 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1125                                          VMRegPair *regs,
1126                                          int total_args_passed) {
1127 
1128 // We return the number of VMRegImpl stack slots we need to reserve for all
1129 // the arguments NOT counting out_preserve_stack_slots.
1130 
1131 // NOTE: These arrays will have to change when c1 is ported
1132 #ifdef _WIN64
1133     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1134       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1135     };
1136     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1137       c_farg0, c_farg1, c_farg2, c_farg3
1138     };
1139 #else
1140     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1141       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1142     };
1143     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1144       c_farg0, c_farg1, c_farg2, c_farg3,
1145       c_farg4, c_farg5, c_farg6, c_farg7
1146     };
1147 #endif // _WIN64
1148 
1149 
1150     uint int_args = 0;
1151     uint fp_args = 0;
1152     uint stk_args = 0; // inc by 2 each time
1153 
1154     for (int i = 0; i < total_args_passed; i++) {
1155       switch (sig_bt[i]) {
1156       case T_BOOLEAN:
1157       case T_CHAR:
1158       case T_BYTE:
1159       case T_SHORT:
1160       case T_INT:
1161         if (int_args < Argument::n_int_register_parameters_c) {
1162           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1163 #ifdef _WIN64
1164           fp_args++;
1165           // Allocate slots for the callee to stuff register args on the stack.
1166           stk_args += 2;
1167 #endif
1168         } else {
1169           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1170           stk_args += 2;
1171         }
1172         break;
1173       case T_LONG:
1174         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1175         // fall through
1176       case T_OBJECT:
1177       case T_ARRAY:
1178       case T_ADDRESS:
1179       case T_METADATA:
1180         if (int_args < Argument::n_int_register_parameters_c) {
1181           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1182 #ifdef _WIN64
1183           fp_args++;
1184           stk_args += 2;
1185 #endif
1186         } else {
1187           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1188           stk_args += 2;
1189         }
1190         break;
1191       case T_FLOAT:
1192         if (fp_args < Argument::n_float_register_parameters_c) {
1193           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1194 #ifdef _WIN64
1195           int_args++;
1196           // Allocate slots for the callee to stuff register args on the stack.
1197           stk_args += 2;
1198 #endif
1199         } else {
1200           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1201           stk_args += 2;
1202         }
1203         break;
1204       case T_DOUBLE:
1205         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1206         if (fp_args < Argument::n_float_register_parameters_c) {
1207           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1208 #ifdef _WIN64
1209           int_args++;
1210           // Allocate slots for the callee to stuff register args on the stack.
1211           stk_args += 2;
1212 #endif
1213         } else {
1214           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1215           stk_args += 2;
1216         }
1217         break;
1218       case T_VOID: // Halves of longs and doubles
1219         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1220         regs[i].set_bad();
1221         break;
1222       default:
1223         ShouldNotReachHere();
1224         break;
1225       }
1226     }
1227 #ifdef _WIN64
1228   // The Windows ABI requires that we always allocate enough stack space
1229   // for 4 64-bit registers to be stored down.
1230   if (stk_args < 8) {
1231     stk_args = 8;
1232   }
1233 #endif // _WIN64
1234 
1235   return stk_args;
1236 }
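// A brief illustration of how this differs from the Java convention above (a sketch,
// not generated code): for a native signature (JNIEnv*, jobject, jint, jdouble) the
// SysV path hands out c_rarg0, c_rarg1, c_rarg2 and c_farg0 with stk_args == 0, while
// the _WIN64 path hands out c_rarg0, c_rarg1, c_rarg2 and c_farg3 (registers are
// positional there), accumulates 2 shadow slots per argument (stk_args == 8), and in
// any case never returns less than the 8-slot / 32-byte home area.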
1237 
1238 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1239                                              uint num_bits,
1240                                              uint total_args_passed) {
1241   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1242          "only certain vector sizes are supported for now");
1243 
1244   static const XMMRegister VEC_ArgReg[32] = {
1245      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1246      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1247     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1248     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1249   };
1250 
1251   uint stk_args = 0;
1252   uint fp_args = 0;
1253 
1254   for (uint i = 0; i < total_args_passed; i++) {
1255     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1256     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1257     regs[i].set_pair(vmreg->next(next_val), vmreg);
1258   }
1259 
1260   return stk_args;
1261 }
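// For instance (illustrative): with num_bits == 256 each argument i is assigned the pair
// (VEC_ArgReg[i]->as_VMReg()->next(7), VEC_ArgReg[i]->as_VMReg()), i.e. regs[i] spans the
// 8 consecutive 32-bit VMReg slices that make up one YMM register, and the returned
// stk_args stays 0 because vector arguments are never passed on the stack here.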
1262 
1263 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1264   // We always ignore the frame_slots arg and just use the space just below the frame
1265   // pointer, which by this time is free to use
1266   switch (ret_type) {
1267   case T_FLOAT:
1268     __ movflt(Address(rbp, -wordSize), xmm0);
1269     break;
1270   case T_DOUBLE:
1271     __ movdbl(Address(rbp, -wordSize), xmm0);
1272     break;
1273   case T_VOID:  break;
1274   default: {
1275     __ movptr(Address(rbp, -wordSize), rax);
1276     }
1277   }
1278 }
1279 
1280 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1281   // We always ignore the frame_slots arg and just use the space just below the frame
1282   // pointer, which by this time is free to use
1283   switch (ret_type) {
1284   case T_FLOAT:
1285     __ movflt(xmm0, Address(rbp, -wordSize));
1286     break;
1287   case T_DOUBLE:
1288     __ movdbl(xmm0, Address(rbp, -wordSize));
1289     break;
1290   case T_VOID:  break;
1291   default: {
1292     __ movptr(rax, Address(rbp, -wordSize));
1293     }
1294   }
1295 }
1296 
1297 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1298     for ( int i = first_arg ; i < arg_count ; i++ ) {
1299       if (args[i].first()->is_Register()) {
1300         __ push(args[i].first()->as_Register());
1301       } else if (args[i].first()->is_XMMRegister()) {
1302         __ subptr(rsp, 2*wordSize);
1303         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1304       }
1305     }
1306 }
1307 
1308 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1309     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1310       if (args[i].first()->is_Register()) {
1311         __ pop(args[i].first()->as_Register());
1312       } else if (args[i].first()->is_XMMRegister()) {
1313         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1314         __ addptr(rsp, 2*wordSize);
1315       }
1316     }
1317 }
1318 
1319 static void verify_oop_args(MacroAssembler* masm,
1320                             const methodHandle& method,
1321                             const BasicType* sig_bt,
1322                             const VMRegPair* regs) {
1323   Register temp_reg = rbx;  // not part of any compiled calling seq
1324   if (VerifyOops) {
1325     for (int i = 0; i < method->size_of_parameters(); i++) {
1326       if (is_reference_type(sig_bt[i])) {
1327         VMReg r = regs[i].first();
1328         assert(r->is_valid(), "bad oop arg");
1329         if (r->is_stack()) {
1330           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1331           __ verify_oop(temp_reg);
1332         } else {
1333           __ verify_oop(r->as_Register());
1334         }
1335       }
1336     }
1337   }
1338 }
1339 
1340 static void check_continuation_enter_argument(VMReg actual_vmreg,
1341                                               Register expected_reg,
1342                                               const char* name) {
1343   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1344   assert(actual_vmreg->as_Register() == expected_reg,
1345          "%s is in unexpected register: %s instead of %s",
1346          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1347 }
1348 
1349 
1350 //---------------------------- continuation_enter_setup ---------------------------
1351 //
1352 // Arguments:
1353 //   None.
1354 //
1355 // Results:
1356 //   rsp: pointer to blank ContinuationEntry
1357 //
1358 // Kills:
1359 //   rax
1360 //
1361 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1362   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1363   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1364   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1365 
1366   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1367   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1368 
1369   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1370   OopMap* map = new OopMap(frame_size, 0);
1371 
1372   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1373   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1374   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1375 
1376   return map;
1377 }
1378 
1379 //---------------------------- fill_continuation_entry ---------------------------
1380 //
1381 // Arguments:
1382 //   rsp: pointer to blank ContinuationEntry
1383 //   reg_cont_obj: pointer to the continuation
1384 //   reg_flags: flags
1385 //
1386 // Results:
1387 //   rsp: pointer to filled out ContinuationEntry
1388 //
1389 // Kills:
1390 //   rax
1391 //
1392 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1393   assert_different_registers(rax, reg_cont_obj, reg_flags);
1394 #ifdef ASSERT
1395   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1396 #endif
1397   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1398   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1399   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1400   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1401   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1402 
1403   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1404   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1405   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1406   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1407 
1408   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1409   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1410 }
1411 
1412 //---------------------------- continuation_enter_cleanup ---------------------------
1413 //
1414 // Arguments:
1415 //   rsp: pointer to the ContinuationEntry
1416 //
1417 // Results:
1418 //   rsp: pointer to the spilled rbp in the entry frame
1419 //
1420 // Kills:
1421 //   rbx
1422 //
1423 static void continuation_enter_cleanup(MacroAssembler* masm) {
1424 #ifdef ASSERT
1425   Label L_good_sp;
1426   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1427   __ jcc(Assembler::equal, L_good_sp);
1428   __ stop("Incorrect rsp at continuation_enter_cleanup");
1429   __ bind(L_good_sp);
1430 #endif
1431   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1432   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1433 
1434   if (CheckJNICalls) {
1435     // Check if this is a virtual thread continuation
1436     Label L_skip_vthread_code;
1437     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1438     __ jcc(Assembler::equal, L_skip_vthread_code);
1439 
1440     // If the held monitor count is > 0 and this vthread is terminating then
1441     // it failed to release a JNI monitor. So we issue the same log message
1442     // that JavaThread::exit does.
1443     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1444     __ jcc(Assembler::equal, L_skip_vthread_code);
1445 
1446     // rax may hold an exception oop, save it before the call
1447     __ push(rax);
1448     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1449     __ pop(rax);
1450 
1451     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1452     // on termination. The held count is implicitly zeroed below when we restore from
1453     // the parent held count (which has to be zero).
1454     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1455 
1456     __ bind(L_skip_vthread_code);
1457   }
1458 #ifdef ASSERT
1459   else {
1460     // Check if this is a virtual thread continuation
1461     Label L_skip_vthread_code;
1462     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1463     __ jcc(Assembler::equal, L_skip_vthread_code);
1464 
1465     // See comment just above. If not checking JNI calls the JNI count is only
1466     // needed for assertion checking.
1467     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1468 
1469     __ bind(L_skip_vthread_code);
1470   }
1471 #endif
1472 
1473   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1474   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1475 
1476   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1477   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1478   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1479 }
1480 
1481 static void gen_continuation_enter(MacroAssembler* masm,
1482                                    const VMRegPair* regs,
1483                                    int& exception_offset,
1484                                    OopMapSet* oop_maps,
1485                                    int& frame_complete,
1486                                    int& stack_slots,
1487                                    int& interpreted_entry_offset,
1488                                    int& compiled_entry_offset) {
1489 
1490   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1491   int pos_cont_obj   = 0;
1492   int pos_is_cont    = 1;
1493   int pos_is_virtual = 2;
1494 
1495   // The platform-specific calling convention may present the arguments in various registers.
1496   // To simplify the rest of the code, we expect the arguments to reside in these known
1497   // registers, and we additionally check the placement here in case the calling convention
1498   // ever changes.
1499   Register reg_cont_obj   = c_rarg1;
1500   Register reg_is_cont    = c_rarg2;
1501   Register reg_is_virtual = c_rarg3;
1502 
1503   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1504   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1505   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1506 
1507   // Utility methods kill rax, make sure there are no collisions
1508   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1509 
1510   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1511                          relocInfo::static_call_type);
1512 
1513   address start = __ pc();
1514 
1515   Label L_thaw, L_exit;
1516 
1517   // i2i entry used at interp_only_mode only
1518   interpreted_entry_offset = __ pc() - start;
1519   {
1520 #ifdef ASSERT
1521     Label is_interp_only;
1522     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1523     __ jcc(Assembler::notEqual, is_interp_only);
1524     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1525     __ bind(is_interp_only);
1526 #endif
1527 
1528     __ pop(rax); // return address
1529     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1530     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1531     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1532     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
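         // (Interpreter arguments sit on the expression stack with the first
         // argument farthest from rsp, hence the reversed offsets above.)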
1533     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1534     __ push(rax); // return address
1535     __ push_cont_fastpath();
1536 
1537     __ enter();
1538 
1539     stack_slots = 2; // will be adjusted in setup
1540     OopMap* map = continuation_enter_setup(masm, stack_slots);
1541     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1542     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1543 
1544     __ verify_oop(reg_cont_obj);
1545 
1546     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1547 
1548     // If isContinue, call to thaw. Otherwise, resolve the call and exit.
1549     __ testptr(reg_is_cont, reg_is_cont);
1550     __ jcc(Assembler::notZero, L_thaw);
1551 
1552     // --- Resolve path
1553 
1554     // Make sure the call is patchable
1555     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1556     // Emit stub for static call
1557     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1558     if (stub == nullptr) {
1559       fatal("CodeCache is full at gen_continuation_enter");
1560     }
1561     __ call(resolve);
1562     oop_maps->add_gc_map(__ pc() - start, map);
1563     __ post_call_nop();
1564 
1565     __ jmp(L_exit);
1566   }
1567 
1568   // compiled entry
1569   __ align(CodeEntryAlignment);
1570   compiled_entry_offset = __ pc() - start;
1571   __ enter();
1572 
1573   stack_slots = 2; // will be adjusted in setup
1574   OopMap* map = continuation_enter_setup(masm, stack_slots);
1575 
1576   // Frame is now completed as far as size and linkage.
1577   frame_complete = __ pc() - start;
1578 
1579   __ verify_oop(reg_cont_obj);
1580 
1581   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1582 
1583   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1584   __ testptr(reg_is_cont, reg_is_cont);
1585   __ jccb(Assembler::notZero, L_thaw);
1586 
1587   // --- call Continuation.enter(Continuation c, boolean isContinue)
1588 
1589   // Make sure the call is patchable
1590   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1591 
1592   // Emit stub for static call
1593   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1594   if (stub == nullptr) {
1595     fatal("CodeCache is full at gen_continuation_enter");
1596   }
1597 
1598   // The call needs to be resolved. There's a special case for this in
1599   // SharedRuntime::find_callee_info_helper() which calls
1600   // LinkResolver::resolve_continuation_enter() which resolves the call to
1601   // Continuation.enter(Continuation c, boolean isContinue).
1602   __ call(resolve);
1603 
1604   oop_maps->add_gc_map(__ pc() - start, map);
1605   __ post_call_nop();
1606 
1607   __ jmpb(L_exit);
1608 
1609   // --- Thawing path
1610 
1611   __ bind(L_thaw);
1612 
1613   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1614   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1615 
1616   ContinuationEntry::_return_pc_offset = __ pc() - start;
1617   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1618   __ post_call_nop();
1619 
1620   // --- Normal exit (resolve/thawing)
1621 
1622   __ bind(L_exit);
1623   ContinuationEntry::_cleanup_offset = __ pc() - start;
1624   continuation_enter_cleanup(masm);
1625   __ pop(rbp);
1626   __ ret(0);
1627 
1628   // --- Exception handling path
1629 
1630   exception_offset = __ pc() - start;
1631 
1632   continuation_enter_cleanup(masm);
1633   __ pop(rbp);
1634 
1635   __ movptr(c_rarg0, r15_thread);
1636   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1637 
1638   // rax still holds the original exception oop, save it before the call
1639   __ push(rax);
1640 
1641   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1642   __ movptr(rbx, rax);
1643 
1644   // Continue at exception handler:
1645   //   rax: exception oop
1646   //   rbx: exception handler
1647   //   rdx: exception pc
1648   __ pop(rax);
1649   __ verify_oop(rax);
1650   __ pop(rdx);
1651   __ jmp(rbx);
1652 }
1653 
1654 static void gen_continuation_yield(MacroAssembler* masm,
1655                                    const VMRegPair* regs,
1656                                    OopMapSet* oop_maps,
1657                                    int& frame_complete,
1658                                    int& stack_slots,
1659                                    int& compiled_entry_offset) {
1660   enum layout {
1661     rbp_off,
1662     rbpH_off,
1663     return_off,
1664     return_off2,
1665     framesize // inclusive of return address
1666   };
1667   stack_slots = framesize / VMRegImpl::slots_per_word;
1668   assert(stack_slots == 2, "recheck layout");
1669 
1670   address start = __ pc();
1671   compiled_entry_offset = __ pc() - start;
1672   __ enter();
1673   address the_pc = __ pc();
1674 
1675   frame_complete = the_pc - start;
1676 
1677   // This nop must be exactly at the PC we push into the frame info.
1678   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1679   // with it right away.
1680   __ post_call_nop();
1681   OopMap* map = new OopMap(framesize, 1);
1682   oop_maps->add_gc_map(frame_complete, map);
1683 
1684   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1685   __ movptr(c_rarg0, r15_thread);
1686   __ movptr(c_rarg1, rsp);
1687   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1688   __ reset_last_Java_frame(true);
1689 
1690   Label L_pinned;
1691 
1692   __ testptr(rax, rax);
1693   __ jcc(Assembler::notZero, L_pinned);
1694 
1695   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1696   continuation_enter_cleanup(masm);
1697   __ pop(rbp);
1698   __ ret(0);
1699 
1700   __ bind(L_pinned);
1701 
1702   // Pinned, return to caller
1703 
1704   // handle pending exception thrown by freeze
1705   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1706   Label ok;
1707   __ jcc(Assembler::equal, ok);
1708   __ leave();
1709   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1710   __ bind(ok);
1711 
1712   __ leave();
1713   __ ret(0);
1714 }
1715 
1716 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1717   ::continuation_enter_cleanup(masm);
1718 }
1719 
1720 static void gen_special_dispatch(MacroAssembler* masm,
1721                                  const methodHandle& method,
1722                                  const BasicType* sig_bt,
1723                                  const VMRegPair* regs) {
1724   verify_oop_args(masm, method, sig_bt, regs);
1725   vmIntrinsics::ID iid = method->intrinsic_id();
1726 
1727   // Now write the args into the outgoing interpreter space
1728   bool     has_receiver   = false;
1729   Register receiver_reg   = noreg;
1730   int      member_arg_pos = -1;
1731   Register member_reg     = noreg;
1732   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1733   if (ref_kind != 0) {
1734     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1735     member_reg = rbx;  // known to be free at this point
1736     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1737   } else if (iid == vmIntrinsics::_invokeBasic) {
1738     has_receiver = true;
1739   } else if (iid == vmIntrinsics::_linkToNative) {
1740     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1741     member_reg = rbx;  // known to be free at this point
1742   } else {
1743     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1744   }
1745 
1746   if (member_reg != noreg) {
1747     // Load the member_arg into register, if necessary.
1748     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1749     VMReg r = regs[member_arg_pos].first();
1750     if (r->is_stack()) {
1751       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1752     } else {
1753       // no data motion is needed
1754       member_reg = r->as_Register();
1755     }
1756   }
1757 
1758   if (has_receiver) {
1759     // Make sure the receiver is loaded into a register.
1760     assert(method->size_of_parameters() > 0, "oob");
1761     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1762     VMReg r = regs[0].first();
1763     assert(r->is_valid(), "bad receiver arg");
1764     if (r->is_stack()) {
1765       // Porting note:  This assumes that compiled calling conventions always
1766       // pass the receiver oop in a register.  If this is not true on some
1767       // platform, pick a temp and load the receiver from stack.
1768       fatal("receiver always in a register");
1769       receiver_reg = j_rarg0;  // known to be free at this point
1770       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1771     } else {
1772       // no data motion is needed
1773       receiver_reg = r->as_Register();
1774     }
1775   }
1776 
1777   // Figure out which address we are really jumping to:
1778   MethodHandles::generate_method_handle_dispatch(masm, iid,
1779                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1780 }
1781 
1782 // ---------------------------------------------------------------------------
1783 // Generate a native wrapper for a given method.  The method takes arguments
1784 // in the Java compiled code convention, marshals them to the native
1785 // convention (handlizes oops, etc), transitions to native, makes the call,
1786 // returns to java state (possibly blocking), unhandlizes any result and
1787 // returns.
1788 //
1789 // Critical native functions are a shorthand for the use of
1790 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1791 // functions.  The wrapper is expected to unpack the arguments before
1792 // passing them to the callee. Critical native functions leave the state _in_Java,
1793 // since they cannot stop for GC.
1794 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1795 // block and the check for pending exceptions, since it's impossible for them
1796 // to be thrown.
1797 //
1798 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1799                                                 const methodHandle& method,
1800                                                 int compile_id,
1801                                                 BasicType* in_sig_bt,
1802                                                 VMRegPair* in_regs,
1803                                                 BasicType ret_type) {
1804   if (method->is_continuation_native_intrinsic()) {
1805     int exception_offset = -1;
1806     OopMapSet* oop_maps = new OopMapSet();
1807     int frame_complete = -1;
1808     int stack_slots = -1;
1809     int interpreted_entry_offset = -1;
1810     int vep_offset = -1;
1811     if (method->is_continuation_enter_intrinsic()) {
1812       gen_continuation_enter(masm,
1813                              in_regs,
1814                              exception_offset,
1815                              oop_maps,
1816                              frame_complete,
1817                              stack_slots,
1818                              interpreted_entry_offset,
1819                              vep_offset);
1820     } else if (method->is_continuation_yield_intrinsic()) {
1821       gen_continuation_yield(masm,
1822                              in_regs,
1823                              oop_maps,
1824                              frame_complete,
1825                              stack_slots,
1826                              vep_offset);
1827     } else {
1828       guarantee(false, "Unknown Continuation native intrinsic");
1829     }
1830 
1831 #ifdef ASSERT
1832     if (method->is_continuation_enter_intrinsic()) {
1833       assert(interpreted_entry_offset != -1, "Must be set");
1834       assert(exception_offset != -1,         "Must be set");
1835     } else {
1836       assert(interpreted_entry_offset == -1, "Must be unset");
1837       assert(exception_offset == -1,         "Must be unset");
1838     }
1839     assert(frame_complete != -1,    "Must be set");
1840     assert(stack_slots != -1,       "Must be set");
1841     assert(vep_offset != -1,        "Must be set");
1842 #endif
1843 
1844     __ flush();
1845     nmethod* nm = nmethod::new_native_nmethod(method,
1846                                               compile_id,
1847                                               masm->code(),
1848                                               vep_offset,
1849                                               frame_complete,
1850                                               stack_slots,
1851                                               in_ByteSize(-1),
1852                                               in_ByteSize(-1),
1853                                               oop_maps,
1854                                               exception_offset);
1855     if (nm == nullptr) return nm;
1856     if (method->is_continuation_enter_intrinsic()) {
1857       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1858     } else if (method->is_continuation_yield_intrinsic()) {
1859       _cont_doYield_stub = nm;
1860     }
1861     return nm;
1862   }
1863 
1864   if (method->is_method_handle_intrinsic()) {
1865     vmIntrinsics::ID iid = method->intrinsic_id();
1866     intptr_t start = (intptr_t)__ pc();
1867     int vep_offset = ((intptr_t)__ pc()) - start;
1868     gen_special_dispatch(masm,
1869                          method,
1870                          in_sig_bt,
1871                          in_regs);
1872     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1873     __ flush();
1874     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1875     return nmethod::new_native_nmethod(method,
1876                                        compile_id,
1877                                        masm->code(),
1878                                        vep_offset,
1879                                        frame_complete,
1880                                        stack_slots / VMRegImpl::slots_per_word,
1881                                        in_ByteSize(-1),
1882                                        in_ByteSize(-1),
1883                                        nullptr);
1884   }
1885   address native_func = method->native_function();
1886   assert(native_func != nullptr, "must have function");
1887 
1888   // An OopMap for lock (and class if static)
1889   OopMapSet *oop_maps = new OopMapSet();
1890   intptr_t start = (intptr_t)__ pc();
1891 
1892   // We have received a description of where all the java args are located
1893   // on entry to the wrapper. We need to convert these args to where
1894   // the jni function will expect them. To figure out where they go
1895   // we convert the java signature to a C signature by inserting
1896   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1897 
1898   const int total_in_args = method->size_of_parameters();
1899   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1900 
1901   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1902   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1903 
1904   int argc = 0;
1905   out_sig_bt[argc++] = T_ADDRESS;
1906   if (method->is_static()) {
1907     out_sig_bt[argc++] = T_OBJECT;
1908   }
1909 
1910   for (int i = 0; i < total_in_args ; i++ ) {
1911     out_sig_bt[argc++] = in_sig_bt[i];
1912   }
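       // For example (illustrative): a static native "int foo(long)" yields
       // out_sig_bt == { T_ADDRESS (JNIEnv*), T_OBJECT (class mirror), T_LONG, T_VOID }.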
1913 
1914   // Now figure out where the args must be stored and how much stack space
1915   // they require.
1916   int out_arg_slots;
1917   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1918 
1919   // Compute framesize for the wrapper.  We need to handlize all oops in
1920   // incoming registers
1921 
1922   // Calculate the total number of stack slots we will need.
1923 
1924   // First count the abi requirement plus all of the outgoing args
1925   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1926 
1927   // Now the space for the inbound oop handle area
1928   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1929 
1930   int oop_handle_offset = stack_slots;
1931   stack_slots += total_save_slots;
1932 
1933   // Now any space we need for handlizing a klass if this is a static method
1934 
1935   int klass_slot_offset = 0;
1936   int klass_offset = -1;
1937   int lock_slot_offset = 0;
1938   bool is_static = false;
1939 
1940   if (method->is_static()) {
1941     klass_slot_offset = stack_slots;
1942     stack_slots += VMRegImpl::slots_per_word;
1943     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1944     is_static = true;
1945   }
1946 
1947   // Plus a lock if needed
1948 
1949   if (method->is_synchronized()) {
1950     lock_slot_offset = stack_slots;
1951     stack_slots += VMRegImpl::slots_per_word;
1952   }
1953 
1954   // Now a place (+2) to save return values or temp during shuffling
1955   // + 4 for return address (which we own) and saved rbp
1956   stack_slots += 6;
1957 
1958   // OK, the space we have allocated will look like:
1959   //
1960   //
1961   // FP-> |                     |
1962   //      |---------------------|
1963   //      | 2 slots for moves   |
1964   //      |---------------------|
1965   //      | lock box (if sync)  |
1966   //      |---------------------| <- lock_slot_offset
1967   //      | klass (if static)   |
1968   //      |---------------------| <- klass_slot_offset
1969   //      | oopHandle area      |
1970   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1971   //      | outbound memory     |
1972   //      | based arguments     |
1973   //      |                     |
1974   //      |---------------------|
1975   //      |                     |
1976   // SP-> | out_preserved_slots |
1977   //
1978   //
1979 
1980 
1981   // Now compute actual number of stack words we need rounding to make
1982   // stack properly aligned.
1983   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1984 
1985   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1986 
1987   // First thing make an ic check to see if we should even be here
1988 
1989   // We are free to use all registers as temps without saving them and
1990   // restoring them except rbp. rbp is the only callee save register
1991   // as far as the interpreter and the compiler(s) are concerned.
1992 
1993   const Register receiver = j_rarg0;
1994 
1995   Label exception_pending;
1996 
1997   assert_different_registers(receiver, rscratch1, rscratch2);
1998   __ verify_oop(receiver);
1999   __ ic_check(8 /* end_alignment */);
2000 
2001   int vep_offset = ((intptr_t)__ pc()) - start;
2002 
2003   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2004     Label L_skip_barrier;
2005     Register klass = r10;
2006     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2007     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2008 
2009     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2010 
2011     __ bind(L_skip_barrier);
2012   }
2013 
2014 #ifdef COMPILER1
2015   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2016   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2017     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2018   }
2019 #endif // COMPILER1
2020 
2021   // The instruction at the verified entry point must be 5 bytes or longer
2022   // because it can be patched on the fly by make_non_entrant. The stack bang
2023   // instruction fits that requirement.
2024 
2025   // Generate stack overflow check
2026   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2027 
2028   // Generate a new frame for the wrapper.
2029   __ enter();
2030   // -2 because return address is already present and so is saved rbp
2031   __ subptr(rsp, stack_size - 2*wordSize);
2032 
2033   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2034   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2035   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2036 
2037   // Frame is now completed as far as size and linkage.
2038   int frame_complete = ((intptr_t)__ pc()) - start;
2039 
2040 #ifdef ASSERT
2041   __ check_stack_alignment(rsp, "improperly aligned stack");
2042 #endif /* ASSERT */
2043 
2044 
2045   // We use r14 as the oop handle for the receiver/klass
2046   // It is callee save so it survives the call to native
2047 
2048   const Register oop_handle_reg = r14;
2049 
2050   //
2051   // We immediately shuffle the arguments so that for any vm call we have to
2052   // make from here on out (sync slow path, jvmti, etc.) we will have
2053   // captured the oops from our caller and have a valid oopMap for
2054   // them.
2055 
2056   // -----------------
2057   // The Grand Shuffle
2058 
2059   // The Java calling convention is either equal (linux) or denser (win64) than the
2060   // c calling convention. However, because of the jni_env argument the c calling
2061   // convention always has at least one more (and two more for static methods) arguments than Java.
2062   // Therefore, if we move the args from java -> c backwards, we will never have
2063   // a register->register conflict and we don't have to build a dependency graph
2064   // and figure out how to break any cycles.
2065   //
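       // Illustration (conventions simplified, register names hypothetical): if Java
       // passes (a0, a1) in R1, R2 and C expects (env, a0, a1) in R1, R2, R3, moving
       // backwards copies a1: R2 -> R3 first and a0: R1 -> R2 second, so no source is
       // overwritten before it is read; env is loaded into R1 only afterwards.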
2066 
2067   // Record esp-based slot for receiver on stack for non-static methods
2068   int receiver_offset = -1;
2069 
2070   // This is a trick. We double the stack slots so we can claim
2071   // the oops in the caller's frame. Since we are sure to have
2072   // more args than the caller, doubling is enough to make
2073   // sure we can capture all the incoming oop args from the
2074   // caller.
2075   //
2076   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2077 
2078   // Mark location of rbp (someday)
2079   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2080 
2081   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2082   // All inbound args are referenced based on rbp and all outbound args via rsp.
2083 
2084 
2085 #ifdef ASSERT
2086   bool reg_destroyed[Register::number_of_registers];
2087   bool freg_destroyed[XMMRegister::number_of_registers];
2088   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2089     reg_destroyed[r] = false;
2090   }
2091   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2092     freg_destroyed[f] = false;
2093   }
2094 
2095 #endif /* ASSERT */
2096 
2097   // For JNI natives the incoming and outgoing registers are offset upwards.
2098   GrowableArray<int> arg_order(2 * total_in_args);
2099 
2100   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2101     arg_order.push(i);
2102     arg_order.push(c_arg);
2103   }
2104 
2105   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2106     int i = arg_order.at(ai);
2107     int c_arg = arg_order.at(ai + 1);
2108     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2109 #ifdef ASSERT
2110     if (in_regs[i].first()->is_Register()) {
2111       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2112     } else if (in_regs[i].first()->is_XMMRegister()) {
2113       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2114     }
2115     if (out_regs[c_arg].first()->is_Register()) {
2116       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2117     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2118       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2119     }
2120 #endif /* ASSERT */
2121     switch (in_sig_bt[i]) {
2122       case T_ARRAY:
2123       case T_OBJECT:
2124         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2125                     ((i == 0) && (!is_static)),
2126                     &receiver_offset);
2127         break;
2128       case T_VOID:
2129         break;
2130 
2131       case T_FLOAT:
2132         __ float_move(in_regs[i], out_regs[c_arg]);
2133         break;
2134 
2135       case T_DOUBLE:
2136         assert( i + 1 < total_in_args &&
2137                 in_sig_bt[i + 1] == T_VOID &&
2138                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2139         __ double_move(in_regs[i], out_regs[c_arg]);
2140         break;
2141 
2142       case T_LONG :
2143         __ long_move(in_regs[i], out_regs[c_arg]);
2144         break;
2145 
2146       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2147 
2148       default:
2149         __ move32_64(in_regs[i], out_regs[c_arg]);
2150     }
2151   }
2152 
2153   int c_arg;
2154 
2155   // Pre-load a static method's oop into r14.  Used both by locking code and
2156   // the normal JNI call code.
2157   // point c_arg at the first arg that is already loaded in case we
2158   // need to spill before we call out
2159   c_arg = total_c_args - total_in_args;
2160 
2161   if (method->is_static()) {
2162 
2163     //  load oop into a register
2164     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2165 
2166     // Now handlize the static class mirror; it's known not-null.
2167     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2168     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2169 
2170     // Now get the handle
2171     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2172     // store the klass handle as second argument
2173     __ movptr(c_rarg1, oop_handle_reg);
2174     // and protect the arg if we must spill
2175     c_arg--;
2176   }
2177 
2178   // Change state to native (we save the return address in the thread, since it might not
2179   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2180   // points into the right code segment. It does not have to be the correct return pc.
2181   // We use the same pc/oopMap repeatedly when we call out
2182 
2183   Label native_return;
2184   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2185     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2186     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2187   } else {
2188     intptr_t the_pc = (intptr_t) __ pc();
2189     oop_maps->add_gc_map(the_pc - start, map);
2190 
2191     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2192   }
2193 
2194   // We have all of the arguments set up at this point. We must not touch any register
2195   // argument registers at this point (if we had to save/restore them, there would be no oop maps for them).
2196 
2197   if (DTraceMethodProbes) {
2198     // protect the args we've loaded
2199     save_args(masm, total_c_args, c_arg, out_regs);
2200     __ mov_metadata(c_rarg1, method());
2201     __ call_VM_leaf(
2202       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2203       r15_thread, c_rarg1);
2204     restore_args(masm, total_c_args, c_arg, out_regs);
2205   }
2206 
2207   // RedefineClasses() tracing support for obsolete method entry
2208   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2209     // protect the args we've loaded
2210     save_args(masm, total_c_args, c_arg, out_regs);
2211     __ mov_metadata(c_rarg1, method());
2212     __ call_VM_leaf(
2213       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2214       r15_thread, c_rarg1);
2215     restore_args(masm, total_c_args, c_arg, out_regs);
2216   }
2217 
2218   // Lock a synchronized method
2219 
2220   // Register definitions used by locking and unlocking
2221 
2222   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2223   const Register obj_reg  = rbx;  // Will contain the oop
2224   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2225   const Register old_hdr  = r13;  // value of old header at unlock time
2226 
2227   Label slow_path_lock;
2228   Label lock_done;
2229 
2230   if (method->is_synchronized()) {
2231     Label count_mon;
2232 
2233     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2234 
2235     // Get the handle (the 2nd argument)
2236     __ mov(oop_handle_reg, c_rarg1);
2237 
2238     // Get address of the box
2239 
2240     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2241 
2242     // Load the oop from the handle
2243     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2244 
2245     if (LockingMode == LM_MONITOR) {
2246       __ jmp(slow_path_lock);
2247     } else if (LockingMode == LM_LEGACY) {
2248       // Load immediate 1 into swap_reg %rax
2249       __ movl(swap_reg, 1);
2250 
2251       // Load (object->mark() | 1) into swap_reg %rax
2252       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2253 
2254       // Save (object->mark() | 1) into BasicLock's displaced header
2255       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2256 
2257       // src -> dest iff dest == rax else rax <- dest
2258       __ lock();
2259       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2260       __ jcc(Assembler::equal, count_mon);
2261 
2262       // Hmm should this move to the slow path code area???
2263 
2264       // Test if the oopMark is an obvious stack pointer, i.e.,
2265       //  1) (mark & 3) == 0, and
2266       //  2) rsp <= mark < rsp + os::vm_page_size()
2267       // These 3 tests can be done by evaluating the following
2268       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2269       // assuming both stack pointer and pagesize have their
2270       // least significant 2 bits clear.
2271       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
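           // Illustrative example (assuming a 4 KiB page): 3 - os::vm_page_size()
           // is -4093, i.e. a mask with the low two bits and every bit >= 12 set.
           // If the mark is our own stack lock at, say, rsp + 0x40, then
           // (0x40 & mask) == 0 and we treat the lock as recursive; any other mark
           // value lies more than a page from rsp (or has its low bits set), leaves
           // a non-zero result, and sends us to the slow path.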
2272 
2273       __ subptr(swap_reg, rsp);
2274       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2275 
2276       // Save the test result; for the recursive case, the result is zero
2277       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2278       __ jcc(Assembler::notEqual, slow_path_lock);
2279 
2280       __ bind(count_mon);
2281       __ inc_held_monitor_count();
2282     } else {
2283       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2284       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2285     }
2286 
2287     // Slow path will re-enter here
2288     __ bind(lock_done);
2289   }
2290 
2291   // Finally just about ready to make the JNI call
2292 
2293   // get JNIEnv* which is first argument to native
2294   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2295 
2296   // Now set thread in native
2297   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2298 
2299   __ call(RuntimeAddress(native_func));
2300 
2301   // Verify or restore cpu control state after JNI call
2302   __ restore_cpu_control_state_after_jni(rscratch1);
2303 
2304   // Unpack native results.
2305   switch (ret_type) {
2306   case T_BOOLEAN: __ c2bool(rax);            break;
2307   case T_CHAR   : __ movzwl(rax, rax);      break;
2308   case T_BYTE   : __ sign_extend_byte (rax); break;
2309   case T_SHORT  : __ sign_extend_short(rax); break;
2310   case T_INT    : /* nothing to do */        break;
2311   case T_DOUBLE :
2312   case T_FLOAT  :
2313     // Result is in xmm0; we'll save as needed
2314     break;
2315   case T_ARRAY:                 // Really a handle
2316   case T_OBJECT:                // Really a handle
2317       break; // can't de-handlize until after safepoint check
2318   case T_VOID: break;
2319   case T_LONG: break;
2320   default       : ShouldNotReachHere();
2321   }
2322 
2323   // Switch thread to "native transition" state before reading the synchronization state.
2324   // This additional state is necessary because reading and testing the synchronization
2325   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2326   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2327   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2328   //     Thread A is resumed to finish this native method, but doesn't block here since it
2329   //     didn't see any synchronization in progress, and escapes.
2330   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2331 
2332   // Force this write out before the read below
2333   if (!UseSystemMemoryBarrier) {
2334     __ membar(Assembler::Membar_mask_bits(
2335               Assembler::LoadLoad | Assembler::LoadStore |
2336               Assembler::StoreLoad | Assembler::StoreStore));
2337   }
2338 
2339   // check for safepoint operation in progress and/or pending suspend requests
2340   {
2341     Label Continue;
2342     Label slow_path;
2343 
2344     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2345 
2346     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2347     __ jcc(Assembler::equal, Continue);
2348     __ bind(slow_path);
2349 
2350     // Don't use call_VM as it will see a possible pending exception and forward it
2351     // and never return here preventing us from clearing _last_native_pc down below.
2352     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2353     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2354     // by hand.
2355     //
2356     __ vzeroupper();
2357     save_native_result(masm, ret_type, stack_slots);
2358     __ mov(c_rarg0, r15_thread);
2359     __ mov(r12, rsp); // remember sp
2360     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2361     __ andptr(rsp, -16); // align stack as required by ABI
2362     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2363     __ mov(rsp, r12); // restore sp
2364     __ reinit_heapbase();
2365     // Restore any method result value
2366     restore_native_result(masm, ret_type, stack_slots);
2367     __ bind(Continue);
2368   }
2369 
2370   // change thread state
2371   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2372 
2373   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2374     // Check preemption for Object.wait()
2375     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2376     __ cmpptr(rscratch1, NULL_WORD);
2377     __ jccb(Assembler::equal, native_return);
2378     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2379     __ jmp(rscratch1);
2380     __ bind(native_return);
2381 
2382     intptr_t the_pc = (intptr_t) __ pc();
2383     oop_maps->add_gc_map(the_pc - start, map);
2384   }
2385 
2386 
2387   Label reguard;
2388   Label reguard_done;
2389   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2390   __ jcc(Assembler::equal, reguard);
2391   __ bind(reguard_done);
2392 
2393   // native result if any is live
2394 
2395   // Unlock
2396   Label slow_path_unlock;
2397   Label unlock_done;
2398   if (method->is_synchronized()) {
2399 
2400     Label fast_done;
2401 
2402     // Get locked oop from the handle we passed to jni
2403     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2404 
2405     if (LockingMode == LM_LEGACY) {
2406       Label not_recur;
2407       // Simple recursive lock?
2408       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2409       __ jcc(Assembler::notEqual, not_recur);
2410       __ dec_held_monitor_count();
2411       __ jmpb(fast_done);
2412       __ bind(not_recur);
2413     }
2414 
2415     // Must save rax if it is live now because cmpxchg must use it
2416     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2417       save_native_result(masm, ret_type, stack_slots);
2418     }
2419 
2420     if (LockingMode == LM_MONITOR) {
2421       __ jmp(slow_path_unlock);
2422     } else if (LockingMode == LM_LEGACY) {
2423       // get address of the stack lock
2424       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2425       //  get old displaced header
2426       __ movptr(old_hdr, Address(rax, 0));
2427 
2428       // Atomic swap old header if oop still contains the stack lock
2429       __ lock();
2430       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2431       __ jcc(Assembler::notEqual, slow_path_unlock);
2432       __ dec_held_monitor_count();
2433     } else {
2434       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2435       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2436     }
2437 
2438     // slow path re-enters here
2439     __ bind(unlock_done);
2440     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2441       restore_native_result(masm, ret_type, stack_slots);
2442     }
2443 
2444     __ bind(fast_done);
2445   }
2446   if (DTraceMethodProbes) {
2447     save_native_result(masm, ret_type, stack_slots);
2448     __ mov_metadata(c_rarg1, method());
2449     __ call_VM_leaf(
2450          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2451          r15_thread, c_rarg1);
2452     restore_native_result(masm, ret_type, stack_slots);
2453   }
2454 
2455   __ reset_last_Java_frame(false);
2456 
2457   // Unbox oop result, e.g. JNIHandles::resolve value.
2458   if (is_reference_type(ret_type)) {
2459     __ resolve_jobject(rax /* value */,
2460                        r15_thread /* thread */,
2461                        rcx /* tmp */);
2462   }
2463 
2464   if (CheckJNICalls) {
2465     // clear_pending_jni_exception_check
2466     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2467   }
2468 
2469   // reset handle block
2470   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2471   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2472 
2473   // pop our frame
2474 
2475   __ leave();
2476 
2477   // Any exception pending?
2478   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2479   __ jcc(Assembler::notEqual, exception_pending);
2480 
2481   // Return
2482 
2483   __ ret(0);
2484 
2485   // Unexpected paths are out of line and go here
2486 
2487   // forward the exception
2488   __ bind(exception_pending);
2489 
2490   // and forward the exception
2491   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2492 
2493   // Slow path locking & unlocking
2494   if (method->is_synchronized()) {
2495 
2496     // BEGIN Slow path lock
2497     __ bind(slow_path_lock);
2498 
2499     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2500     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2501 
2502     // protect the args we've loaded
2503     save_args(masm, total_c_args, c_arg, out_regs);
2504 
2505     __ mov(c_rarg0, obj_reg);
2506     __ mov(c_rarg1, lock_reg);
2507     __ mov(c_rarg2, r15_thread);
2508 
2509     // Not a leaf but we have last_Java_frame setup as we want.
2510     // We don't want to unmount in case of contention since that would complicate preserving
2511     // the arguments that had already been marshalled into the native convention. So we force
2512     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2513     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2514     __ push_cont_fastpath();
2515     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2516     __ pop_cont_fastpath();
2517     restore_args(masm, total_c_args, c_arg, out_regs);
2518 
2519 #ifdef ASSERT
2520     { Label L;
2521     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2522     __ jcc(Assembler::equal, L);
2523     __ stop("no pending exception allowed on exit from monitorenter");
2524     __ bind(L);
2525     }
2526 #endif
2527     __ jmp(lock_done);
2528 
2529     // END Slow path lock
2530 
2531     // BEGIN Slow path unlock
2532     __ bind(slow_path_unlock);
2533 
2534     // If we haven't already saved the native result we must save it now as xmm registers
2535     // are still exposed.
2536     __ vzeroupper();
2537     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2538       save_native_result(masm, ret_type, stack_slots);
2539     }
2540 
2541     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2542 
2543     __ mov(c_rarg0, obj_reg);
2544     __ mov(c_rarg2, r15_thread);
2545     __ mov(r12, rsp); // remember sp
2546     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2547     __ andptr(rsp, -16); // align stack as required by ABI
2548 
2549     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2550     // NOTE that obj_reg == rbx currently
2551     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2552     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2553 
2554     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2555     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2556     __ mov(rsp, r12); // restore sp
2557     __ reinit_heapbase();
2558 #ifdef ASSERT
2559     {
2560       Label L;
2561       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2562       __ jcc(Assembler::equal, L);
2563       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2564       __ bind(L);
2565     }
2566 #endif /* ASSERT */
2567 
2568     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2569 
2570     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2571       restore_native_result(masm, ret_type, stack_slots);
2572     }
2573     __ jmp(unlock_done);
2574 
2575     // END Slow path unlock
2576 
2577   } // synchronized
2578 
2579   // SLOW PATH Reguard the stack if needed
2580 
2581   __ bind(reguard);
2582   __ vzeroupper();
2583   save_native_result(masm, ret_type, stack_slots);
2584   __ mov(r12, rsp); // remember sp
2585   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2586   __ andptr(rsp, -16); // align stack as required by ABI
2587   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2588   __ mov(rsp, r12); // restore sp
2589   __ reinit_heapbase();
2590   restore_native_result(masm, ret_type, stack_slots);
2591   // and continue
2592   __ jmp(reguard_done);
2593 
2594 
2595 
2596   __ flush();
2597 
2598   nmethod *nm = nmethod::new_native_nmethod(method,
2599                                             compile_id,
2600                                             masm->code(),
2601                                             vep_offset,
2602                                             frame_complete,
2603                                             stack_slots / VMRegImpl::slots_per_word,
2604                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2605                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2606                                             oop_maps);
2607 
2608   return nm;
2609 }
2610 
2611 // This function returns the size adjustment (in number of words) to a c2i adapter
2612 // activation, for use during deoptimization.
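     // Worked example (illustrative): on amd64 Interpreter::stackElementWords == 1,
     // so a callee with 5 locals and 2 parameters needs an adjustment of 3 words.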
2613 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2614   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2615 }
2616 
2617 
2618 uint SharedRuntime::out_preserve_stack_slots() {
2619   return 0;
2620 }
2621 
2622 
2623 // Number of stack slots between incoming argument block and the start of
2624 // a new frame.  The PROLOG must add this many slots to the stack.  The
2625 // EPILOG must remove this many slots.  amd64 needs two slots for the
2626 // return address and two for the saved rbp.
2627 uint SharedRuntime::in_preserve_stack_slots() {
2628   return 4 + 2 * VerifyStackAtCalls;
2629 }
2630 
2631 VMReg SharedRuntime::thread_register() {
2632   return r15_thread->as_VMReg();
2633 }
2634 
2635 //------------------------------generate_deopt_blob----------------------------
2636 void SharedRuntime::generate_deopt_blob() {
2637   // Allocate space for the code
2638   ResourceMark rm;
2639   // Setup code generation tools
2640   int pad = 0;
2641   if (UseAVX > 2) {
2642     pad += 1024;
2643   }
2644   if (UseAPX) {
2645     pad += 1024;
2646   }
2647 #if INCLUDE_JVMCI
2648   if (EnableJVMCI) {
2649     pad += 512; // Increase the buffer size when compiling for JVMCI
2650   }
2651 #endif
2652   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2653   CodeBuffer buffer(name, 2560+pad, 1024);
2654   MacroAssembler* masm = new MacroAssembler(&buffer);
2655   int frame_size_in_words;
2656   OopMap* map = nullptr;
2657   OopMapSet *oop_maps = new OopMapSet();
2658 
2659   // -------------
2660   // This code enters when returning to a de-optimized nmethod.  A return
2661   // address has been pushed on the stack, and return values are in
2662   // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod, at the point where we returned into it. So the return
  // address on the stack is wrong by NativeCall::instruction_size.
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
2668   // In the case of an exception pending when deoptimizing, we enter
2669   // with a return address on the stack that points after the call we patched
2670   // into the exception handler. We have the following register state from,
2671   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2672   //    rax: exception oop
2673   //    rbx: exception handler
2674   //    rdx: throwing pc
2675   // So in this case we simply jam rdx into the useless return address and
2676   // the stack looks just like we want.
2677   //
2678   // At this point we need to de-opt.  We save the argument return
2679   // registers.  We call the first C routine, fetch_unroll_info().  This
2680   // routine captures the return values and returns a structure which
2681   // describes the current frame size and the sizes of all replacement frames.
2682   // The current frame is compiled code and may contain many inlined
2683   // functions, each with their own JVM state.  We pop the current frame, then
2684   // push all the new frames.  Then we call the C routine unpack_frames() to
2685   // populate these frames.  Finally unpack_frames() returns us the new target
2686   // address.  Notice that callee-save registers are BLOWN here; they have
2687   // already been captured in the vframeArray at the time the return PC was
2688   // patched.
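  //
  // A rough road map (explanatory comment only) of the code generated below,
  // in terms of the offsets recorded for the DeoptimizationBlob (the
  // JVMCI-only uncommon trap entries are omitted from this sketch):
  //
  //   start:                    save all registers, r14 = Unpack_deopt, jmp cont
  //   reexecute_offset:         save all registers, r14 = Unpack_reexecute, jmp cont
  //   exception_offset:         stash rax/rdx (exception oop/pc) in TLS, fall through
  //   exception_in_tls_offset:  push a slot for the pc, save all registers,
  //                             r14 = Unpack_exception, patch the return slot with
  //                             the throwing pc taken from TLS
  //   cont:                     call fetch_unroll_info(), pop the deoptimized frame,
  //                             push the skeletal interpreter frames, call
  //                             unpack_frames(), restore the results and return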
2689   address start = __ pc();
2690   Label cont;
2691 
  // Prolog for the non-exception case.
2693 
2694   // Save everything in sight.
2695   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2696 
2697   // Normal deoptimization.  Save exec mode for unpack_frames.
2698   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2699   __ jmp(cont);
2700 
2701   int reexecute_offset = __ pc() - start;
2702 #if INCLUDE_JVMCI && !defined(COMPILER1)
2703   if (UseJVMCICompiler) {
2704     // JVMCI does not use this kind of deoptimization
2705     __ should_not_reach_here();
2706   }
2707 #endif
2708 
  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce an identical oopmap
2713   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2714 
2715   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2716   __ jmp(cont);
2717 
2718 #if INCLUDE_JVMCI
2719   Label after_fetch_unroll_info_call;
2720   int implicit_exception_uncommon_trap_offset = 0;
2721   int uncommon_trap_offset = 0;
2722 
2723   if (EnableJVMCI) {
2724     implicit_exception_uncommon_trap_offset = __ pc() - start;
2725 
2726     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2727     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2728 
2729     uncommon_trap_offset = __ pc() - start;
2730 
2731     // Save everything in sight.
2732     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2733     // fetch_unroll_info needs to call last_java_frame()
2734     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2735 
2736     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2737     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2738 
2739     __ movl(r14, Deoptimization::Unpack_reexecute);
2740     __ mov(c_rarg0, r15_thread);
2741     __ movl(c_rarg2, r14); // exec mode
2742     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2743     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2744 
2745     __ reset_last_Java_frame(false);
2746 
2747     __ jmp(after_fetch_unroll_info_call);
2748   } // EnableJVMCI
2749 #endif // INCLUDE_JVMCI
2750 
2751   int exception_offset = __ pc() - start;
2752 
2753   // Prolog for exception case
2754 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.
2759 
2760   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2761   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2762 
2763   int exception_in_tls_offset = __ pc() - start;
2764 
2765   // new implementation because exception oop is now passed in JavaThread
2766 
2767   // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan.
  // Exception oop and throwing PC are passed in JavaThread.
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)
2772 
2773   // make room on stack for the return address
2774   // It will be patched later with the throwing pc. The correct value is not
2775   // available now because loading it from memory would destroy registers.
2776   __ push(0);
2777 
2778   // Save everything in sight.
2779   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2780 
2781   // Now it is safe to overwrite any register
2782 
2783   // Deopt during an exception.  Save exec mode for unpack_frames.
2784   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2785 
2786   // load throwing pc from JavaThread and patch it as the return address
2787   // of the current frame. Then clear the field in JavaThread
2788 
2789   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2790   __ movptr(Address(rbp, wordSize), rdx);
2791   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2792 
2793 #ifdef ASSERT
2794   // verify that there is really an exception oop in JavaThread
2795   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2796   __ verify_oop(rax);
2797 
2798   // verify that there is no pending exception
2799   Label no_pending_exception;
2800   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2801   __ testptr(rax, rax);
2802   __ jcc(Assembler::zero, no_pending_exception);
2803   __ stop("must not have pending exception here");
2804   __ bind(no_pending_exception);
2805 #endif
2806 
2807   __ bind(cont);
2808 
2809   // Call C code.  Need thread and this frame, but NOT official VM entry
2810   // crud.  We cannot block on this call, no GC can happen.
2811   //
2812   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2813 
2814   // fetch_unroll_info needs to call last_java_frame().
2815 
2816   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2817 #ifdef ASSERT
2818   { Label L;
2819     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2820     __ jcc(Assembler::equal, L);
2821     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2822     __ bind(L);
2823   }
2824 #endif // ASSERT
2825   __ mov(c_rarg0, r15_thread);
2826   __ movl(c_rarg1, r14); // exec_mode
2827   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2828 
2829   // Need to have an oopmap that tells fetch_unroll_info where to
2830   // find any register it might need.
2831   oop_maps->add_gc_map(__ pc() - start, map);
2832 
2833   __ reset_last_Java_frame(false);
2834 
2835 #if INCLUDE_JVMCI
2836   if (EnableJVMCI) {
2837     __ bind(after_fetch_unroll_info_call);
2838   }
2839 #endif
2840 
2841   // Load UnrollBlock* into rdi
2842   __ mov(rdi, rax);
2843 
2844   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
2846   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2847   __ jcc(Assembler::notEqual, noException);
2848   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this load is useless; the exception pc was already cleared to null above
2850   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2851   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2852   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2853 
2854   __ verify_oop(rax);
2855 
2856   // Overwrite the result registers with the exception results.
2857   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2858   // I think this is useless
2859   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2860 
2861   __ bind(noException);
2862 
2863   // Only register save data is on the stack.
2864   // Now restore the result registers.  Everything else is either dead
2865   // or captured in the vframeArray.
2866   RegisterSaver::restore_result_registers(masm);
2867 
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
2870 
2871   // Pop all the frames we must move/replace.
2872   //
2873   // Frame picture (youngest to oldest)
2874   // 1: self-frame (no frame link)
2875   // 2: deopting frame  (no frame link)
2876   // 3: caller of deopting frame (could be compiled/interpreted).
2877   //
  // Note: by leaving the return address of the self-frame on the stack
  // and using the size of frame 2 to adjust the stack,
  // the return address to frame 3 will still be on the stack when we are done.
2881 
2882   // Pop deoptimized frame
2883   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2884   __ addptr(rsp, rcx);
2885 
2886   // rsp should be pointing at the return address to the caller (3)
2887 
  // Pick up the initial fp we should save.
  // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
2890   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2891 
2892 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
2896   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2897   __ bang_stack_size(rbx, rcx);
2898 #endif
2899 
2900   // Load address of array of frame pcs into rcx
2901   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2902 
2903   // Trash the old pc
2904   __ addptr(rsp, wordSize);
2905 
2906   // Load address of array of frame sizes into rsi
2907   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2908 
2909   // Load counter into rdx
2910   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2911 
  // Now adjust the caller's stack to make up for the extra locals,
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame; that way the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
2916 
2917   const Register sender_sp = r8;
2918 
2919   __ mov(sender_sp, rsp);
2920   __ movl(rbx, Address(rdi,
2921                        Deoptimization::UnrollBlock::
2922                        caller_adjustment_offset()));
2923   __ subptr(rsp, rbx);
2924 
2925   // Push interpreter frames in a loop
2926   Label loop;
2927   __ bind(loop);
2928   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2929   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2930   __ pushptr(Address(rcx, 0));          // Save return address
2931   __ enter();                           // Save old & set new ebp
2932   __ subptr(rsp, rbx);                  // Prolog
2933   // This value is corrected by layout_activation_impl
2934   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2935   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2936   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2937   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2938   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2939   __ decrementl(rdx);                   // Decrement counter
2940   __ jcc(Assembler::notZero, loop);
2941   __ pushptr(Address(rcx, 0));          // Save final return address
2942 
2943   // Re-push self-frame
2944   __ enter();                           // Save old & set new ebp
2945 
2946   // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two fewer words.
2948   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2949 
2950   // Restore frame locals after moving the frame
2951   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2952   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2953 
2954   // Call C code.  Need thread but NOT official VM entry
2955   // crud.  We cannot block on this call, no GC can happen.  Call should
2956   // restore return values to their stack-slots with the new SP.
2957   //
2958   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2959 
2960   // Use rbp because the frames look interpreted now
2961   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2962   // Don't need the precise return PC here, just precise enough to point into this code blob.
2963   address the_pc = __ pc();
2964   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2965 
2966   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2967   __ mov(c_rarg0, r15_thread);
2968   __ movl(c_rarg1, r14); // second arg: exec_mode
2969   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2970   // Revert SP alignment after call since we're going to do some SP relative addressing below
2971   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2972 
2973   // Set an oopmap for the call site
2974   // Use the same PC we used for the last java frame
2975   oop_maps->add_gc_map(the_pc - start,
2976                        new OopMap( frame_size_in_words, 0 ));
2977 
2978   // Clear fp AND pc
2979   __ reset_last_Java_frame(true);
2980 
2981   // Collect return values
2982   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2983   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2984   // I think this is useless (throwing pc?)
2985   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2986 
2987   // Pop self-frame.
2988   __ leave();                           // Epilog
2989 
2990   // Jump to interpreter
2991   __ ret(0);
2992 
2993   // Make sure all code is generated
2994   masm->flush();
2995 
2996   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2997   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2998 #if INCLUDE_JVMCI
2999   if (EnableJVMCI) {
3000     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3001     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3002   }
3003 #endif
3004 }
3005 
3006 //------------------------------generate_handler_blob------
3007 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
3010 //
3011 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3012   assert(StubRoutines::forward_exception_entry() != nullptr,
3013          "must be generated before");
3014   assert(is_polling_page_id(id), "expected a polling page stub id");
3015 
3016   ResourceMark rm;
3017   OopMapSet *oop_maps = new OopMapSet();
3018   OopMap* map;
3019 
3020   // Allocate space for the code.  Setup code generation tools.
3021   const char* name = SharedRuntime::stub_name(id);
3022   CodeBuffer buffer(name, 2548, 1024);
3023   MacroAssembler* masm = new MacroAssembler(&buffer);
3024 
3025   address start   = __ pc();
3026   address call_pc = nullptr;
3027   int frame_size_in_words;
3028   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3029   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3030 
3031   // Make room for return address (or push it again)
3032   if (!cause_return) {
3033     __ push(rbx);
3034   }
3035 
3036   // Save registers, fpu state, and flags
3037   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3038 
3039   // The following is basically a call_VM.  However, we need the precise
3040   // address of the call in order to generate an oopmap. Hence, we do all the
3041   // work ourselves.
3042 
3043   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3044 
  // The return address must always be correct so that the frame constructor never
  // sees an invalid pc.
3047 
3048   if (!cause_return) {
3049     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register and we can look at it later to determine
3051     // if someone changed the return address for us!
3052     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3053     __ movptr(Address(rbp, wordSize), rbx);
3054   }
3055 
3056   // Do the call
3057   __ mov(c_rarg0, r15_thread);
3058   __ call(RuntimeAddress(call_ptr));
3059 
3060   // Set an oopmap for the call site.  This oopmap will map all
3061   // oop-registers and debug-info registers as callee-saved.  This
3062   // will allow deoptimization at this safepoint to find all possible
3063   // debug-info recordings, as well as let GC find all oops.
3064 
3065   oop_maps->add_gc_map( __ pc() - start, map);
3066 
3067   Label noException;
3068 
3069   __ reset_last_Java_frame(false);
3070 
3071   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3072   __ jcc(Assembler::equal, noException);
3073 
3074   // Exception pending
3075 
3076   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3077 
3078   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3079 
3080   // No exception case
3081   __ bind(noException);
3082 
3083   Label no_adjust;
3084 #ifdef ASSERT
3085   Label bail;
3086 #endif
3087   if (!cause_return) {
3088     Label no_prefix, not_special, check_rex_prefix;
3089 
3090     // If our stashed return pc was modified by the runtime we avoid touching it
3091     __ cmpptr(rbx, Address(rbp, wordSize));
3092     __ jcc(Assembler::notEqual, no_adjust);
3093 
3094     // Skip over the poll instruction.
3095     // See NativeInstruction::is_safepoint_poll()
3096     // Possible encodings:
3097     //      85 00       test   %eax,(%rax)
3098     //      85 01       test   %eax,(%rcx)
3099     //      85 02       test   %eax,(%rdx)
3100     //      85 03       test   %eax,(%rbx)
3101     //      85 06       test   %eax,(%rsi)
3102     //      85 07       test   %eax,(%rdi)
3103     //
3104     //   41 85 00       test   %eax,(%r8)
3105     //   41 85 01       test   %eax,(%r9)
3106     //   41 85 02       test   %eax,(%r10)
3107     //   41 85 03       test   %eax,(%r11)
3108     //   41 85 06       test   %eax,(%r14)
3109     //   41 85 07       test   %eax,(%r15)
3110     //
3111     //      85 04 24    test   %eax,(%rsp)
3112     //   41 85 04 24    test   %eax,(%r12)
3113     //      85 45 00    test   %eax,0x0(%rbp)
3114     //   41 85 45 00    test   %eax,0x0(%r13)
3115     //
    // Notes:
    //  Format of the legacy MAP0 test instruction:
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
    //     first register operand and of the base register of the memory operand lie in [0-8),
    //     so no additional REX prefix (whose REX.B bit holds the MSB of the register encoding)
    //     is required, which is why a two-byte encoding is sufficient here.
    //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the BASE
    //     register of the memory operand is 1000, so an additional REX prefix is needed in this
    //     case, thereby adding an extra byte to the instruction encoding.
    //  o  If the BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting the Intel APX extension, then a two-byte REX2 prefix must be emitted to hold
    //     the most significant two bits of the 5-bit register encoding.
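    //
    // For example (a sketch based on the encodings listed above, not an
    // exhaustive decode): a poll encoded as "41 85 00" (test %eax,(%r8)) makes
    // the code below advance rbx by 1 when it sees the REX.B prefix byte and by
    // 2 more for the opcode and ModRM bytes, so the patched return pc ends up
    // 3 bytes past the start of the poll instruction.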
3129 
3130     if (VM_Version::supports_apx_f()) {
3131       __ cmpb(Address(rbx, 0), Assembler::REX2);
3132       __ jccb(Assembler::notEqual, check_rex_prefix);
3133       __ addptr(rbx, 2);
3134       __ bind(check_rex_prefix);
3135     }
3136     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3137     __ jccb(Assembler::notEqual, no_prefix);
3138     __ addptr(rbx, 1);
3139     __ bind(no_prefix);
3140 #ifdef ASSERT
3141     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3142 #endif
3143     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3144     // r12/rsp 0x04
3145     // r13/rbp 0x05
3146     __ movzbq(rcx, Address(rbx, 1));
3147     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3148     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3149     __ cmpptr(rcx, 1);
3150     __ jccb(Assembler::above, not_special);
3151     __ addptr(rbx, 1);
3152     __ bind(not_special);
3153 #ifdef ASSERT
3154     // Verify the correct encoding of the poll we're about to skip.
3155     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3156     __ jcc(Assembler::notEqual, bail);
3157     // Mask out the modrm bits
3158     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3159     // rax encodes to 0, so if the bits are nonzero it's incorrect
3160     __ jcc(Assembler::notZero, bail);
3161 #endif
3162     // Adjust return pc forward to step over the safepoint poll instruction
3163     __ addptr(rbx, 2);
3164     __ movptr(Address(rbp, wordSize), rbx);
3165   }
3166 
3167   __ bind(no_adjust);
3168   // Normal exit, restore registers and exit.
3169   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3170   __ ret(0);
3171 
3172 #ifdef ASSERT
3173   __ bind(bail);
3174   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3175 #endif
3176 
3177   // Make sure all code is generated
3178   masm->flush();
3179 
3180   // Fill-out other meta info
3181   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3182 }
3183 
3184 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3186 //
3187 // Generate a stub that calls into vm to find out the proper destination
3188 // of a java call. All the argument registers are live at this point
3189 // but since this is generic code we don't know what they are and the caller
3190 // must do any gc of the args.
3191 //
3192 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3193   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3194   assert(is_resolve_id(id), "expected a resolve stub id");
3195 
3196   // allocate space for the code
3197   ResourceMark rm;
3198 
3199   const char* name = SharedRuntime::stub_name(id);
3200   CodeBuffer buffer(name, 1552, 512);
3201   MacroAssembler* masm = new MacroAssembler(&buffer);
3202 
3203   int frame_size_in_words;
3204 
3205   OopMapSet *oop_maps = new OopMapSet();
3206   OopMap* map = nullptr;
3207 
3208   int start = __ offset();
3209 
3210   // No need to save vector registers since they are caller-saved anyway.
3211   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3212 
3213   int frame_complete = __ offset();
3214 
3215   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3216 
3217   __ mov(c_rarg0, r15_thread);
3218 
3219   __ call(RuntimeAddress(destination));
3220 
3221 
3222   // Set an oopmap for the call site.
3223   // We need this not only for callee-saved registers, but also for volatile
3224   // registers that the compiler might be keeping live across a safepoint.
3225 
3226   oop_maps->add_gc_map( __ offset() - start, map);
3227 
3228   // rax contains the address we are going to jump to assuming no exception got installed
3229 
3230   // clear last_Java_sp
3231   __ reset_last_Java_frame(false);
3232   // check for pending exceptions
3233   Label pending;
3234   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3235   __ jcc(Assembler::notEqual, pending);
3236 
3237   // get the returned Method*
3238   __ get_vm_result_2(rbx, r15_thread);
3239   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3240 
3241   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3242 
3243   RegisterSaver::restore_live_registers(masm);
3244 
3245   // We are back to the original state on entry and ready to go.
3246 
3247   __ jmp(rax);
3248 
3249   // Pending exception after the safepoint
3250 
3251   __ bind(pending);
3252 
3253   RegisterSaver::restore_live_registers(masm);
3254 
3255   // exception pending => remove activation and forward to exception handler
3256 
3257   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3258 
3259   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3260   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3261 
3262   // -------------
3263   // make sure all code is generated
3264   masm->flush();
3265 
  // return the blob (the frame size passed to new_runtime_stub is in words)
3268   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3269 }
3270 
3271 // Continuation point for throwing of implicit exceptions that are
3272 // not handled in the current activation. Fabricates an exception
3273 // oop and initiates normal exception dispatching in this
3274 // frame. Since we need to preserve callee-saved values (currently
3275 // only for C2, but done for C1 as well) we need a callee-saved oop
3276 // map and therefore have to make these stubs into RuntimeStubs
3277 // rather than BufferBlobs.  If the compiler needs all registers to
3278 // be preserved between the fault point and the exception handler
3279 // then it must assume responsibility for that in
3280 // AbstractCompiler::continuation_for_implicit_null_exception or
3281 // continuation_for_implicit_division_by_zero_exception. All other
3282 // implicit exceptions (e.g., NullPointerException or
3283 // AbstractMethodError on entry) are either at call sites or
3284 // otherwise assume that stack unwinding will be initiated, so
3285 // caller saved registers were assumed volatile in the compiler.
3286 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3287   assert(is_throw_id(id), "expected a throw stub id");
3288 
3289   const char* name = SharedRuntime::stub_name(id);
3290 
3291   // Information about frame layout at time of blocking runtime call.
3292   // Note that we only have to preserve callee-saved registers since
3293   // the compilers are responsible for supplying a continuation point
3294   // if they expect all registers to be preserved.
3295   enum layout {
3296     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3297     rbp_off2,
3298     return_off,
3299     return_off2,
3300     framesize // inclusive of return address
3301   };
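  // Note (explanatory only): the layout above is counted in 32-bit VMReg stack
  // slots, which is why rbp and the return address each occupy two entries;
  // framesize is converted from slots back to words further down when the
  // RuntimeStub is created.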
3302 
3303   int insts_size = 512;
3304   int locs_size  = 64;
3305 
3306   ResourceMark rm;
3307   const char* timer_msg = "SharedRuntime generate_throw_exception";
3308   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3309 
3310   CodeBuffer code(name, insts_size, locs_size);
3311   OopMapSet* oop_maps  = new OopMapSet();
3312   MacroAssembler* masm = new MacroAssembler(&code);
3313 
3314   address start = __ pc();
3315 
3316   // This is an inlined and slightly modified version of call_VM
3317   // which has the ability to fetch the return PC out of
3318   // thread-local storage and also sets up last_Java_sp slightly
3319   // differently than the real call_VM
3320 
3321   __ enter(); // required for proper stackwalking of RuntimeStub frame
3322 
3323   assert(is_even(framesize/2), "sp not 16-byte aligned");
3324 
3325   // return address and rbp are already in place
3326   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3327 
3328   int frame_complete = __ pc() - start;
3329 
3330   // Set up last_Java_sp and last_Java_fp
3331   address the_pc = __ pc();
3332   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3333   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3334 
3335   // Call runtime
3336   __ movptr(c_rarg0, r15_thread);
3337   BLOCK_COMMENT("call runtime_entry");
3338   __ call(RuntimeAddress(runtime_entry));
3339 
3340   // Generate oop map
3341   OopMap* map = new OopMap(framesize, 0);
3342 
3343   oop_maps->add_gc_map(the_pc - start, map);
3344 
3345   __ reset_last_Java_frame(true);
3346 
3347   __ leave(); // required for proper stackwalking of RuntimeStub frame
3348 
3349   // check for pending exceptions
3350 #ifdef ASSERT
3351   Label L;
3352   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3353   __ jcc(Assembler::notEqual, L);
3354   __ should_not_reach_here();
3355   __ bind(L);
3356 #endif // ASSERT
3357   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3358 
3359 
3360   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3361   RuntimeStub* stub =
3362     RuntimeStub::new_runtime_stub(name,
3363                                   &code,
3364                                   frame_complete,
3365                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3366                                   oop_maps, false);
3367   return stub;
3368 }
3369 
3370 //------------------------------Montgomery multiplication------------------------
3371 //
3372 
3373 #ifndef _WINDOWS
3374 
3375 // Subtract 0:b from carry:a.  Return carry.
3376 static julong
3377 sub(julong a[], julong b[], julong carry, long len) {
3378   long long i = 0, cnt = len;
3379   julong tmp;
3380   asm volatile("clc; "
3381                "0: ; "
3382                "mov (%[b], %[i], 8), %[tmp]; "
3383                "sbb %[tmp], (%[a], %[i], 8); "
3384                "inc %[i]; dec %[cnt]; "
3385                "jne 0b; "
3386                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3387                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3388                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3389                : "memory");
3390   return tmp;
3391 }
3392 
3393 // Multiply (unsigned) Long A by Long B, accumulating the double-
3394 // length result into the accumulator formed of T0, T1, and T2.
3395 #define MACC(A, B, T0, T1, T2)                                  \
3396 do {                                                            \
3397   unsigned long hi, lo;                                         \
3398   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3399            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3400            : "r"(A), "a"(B) : "cc");                            \
3401  } while(0)
3402 
3403 // As above, but add twice the double-length result into the
3404 // accumulator.
3405 #define MACC2(A, B, T0, T1, T2)                                 \
3406 do {                                                            \
3407   unsigned long hi, lo;                                         \
3408   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3409            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3410            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3411            : "r"(A), "a"(B) : "cc");                            \
3412  } while(0)
3413 
3414 #else //_WINDOWS
3415 
3416 static julong
3417 sub(julong a[], julong b[], julong carry, long len) {
3418   long i;
3419   julong tmp;
3420   unsigned char c = 1;
3421   for (i = 0; i < len; i++) {
3422     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3423     a[i] = tmp;
3424   }
3425   c = _addcarry_u64(c, carry, ~0, &tmp);
3426   return tmp;
3427 }
3428 
3429 // Multiply (unsigned) Long A by Long B, accumulating the double-
3430 // length result into the accumulator formed of T0, T1, and T2.
3431 #define MACC(A, B, T0, T1, T2)                          \
3432 do {                                                    \
3433   julong hi, lo;                            \
3434   lo = _umul128(A, B, &hi);                             \
3435   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3436   c = _addcarry_u64(c, hi, T1, &T1);                    \
3437   _addcarry_u64(c, T2, 0, &T2);                         \
3438  } while(0)
3439 
3440 // As above, but add twice the double-length result into the
3441 // accumulator.
3442 #define MACC2(A, B, T0, T1, T2)                         \
3443 do {                                                    \
3444   julong hi, lo;                            \
3445   lo = _umul128(A, B, &hi);                             \
3446   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3447   c = _addcarry_u64(c, hi, T1, &T1);                    \
3448   _addcarry_u64(c, T2, 0, &T2);                         \
3449   c = _addcarry_u64(0, lo, T0, &T0);                    \
3450   c = _addcarry_u64(c, hi, T1, &T1);                    \
3451   _addcarry_u64(c, T2, 0, &T2);                         \
3452  } while(0)
3453 
3454 #endif //_WINDOWS
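
// In both variants above, MACC and MACC2 maintain a 192-bit accumulator held in
// the three 64-bit limbs (T2:T1:T0). Conceptually (a sketch of the intent, not
// the actual implementation):
//
//   MACC(A, B, T0, T1, T2):   (T2:T1:T0) += (unsigned 128-bit)A * B
//   MACC2(A, B, T0, T1, T2):  (T2:T1:T0) += 2 * (unsigned 128-bit)A * B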
3455 
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000",
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
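//
// A hedged statement of the contract (inferred from the assert below rather
// than spelled out in this file): with R = 2^(64*len) and inv chosen so that
// inv * n[0] == -1 (mod 2^64), the routine leaves in m a len-word value
// congruent to a * b * R^-1 (mod n).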
3459 
3460 static void NOINLINE
3461 montgomery_multiply(julong a[], julong b[], julong n[],
3462                     julong m[], julong inv, int len) {
3463   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3464   int i;
3465 
3466   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3467 
3468   for (i = 0; i < len; i++) {
3469     int j;
3470     for (j = 0; j < i; j++) {
3471       MACC(a[j], b[i-j], t0, t1, t2);
3472       MACC(m[j], n[i-j], t0, t1, t2);
3473     }
3474     MACC(a[i], b[0], t0, t1, t2);
3475     m[i] = t0 * inv;
3476     MACC(m[i], n[0], t0, t1, t2);
3477 
3478     assert(t0 == 0, "broken Montgomery multiply");
3479 
3480     t0 = t1; t1 = t2; t2 = 0;
3481   }
3482 
3483   for (i = len; i < 2*len; i++) {
3484     int j;
3485     for (j = i-len+1; j < len; j++) {
3486       MACC(a[j], b[i-j], t0, t1, t2);
3487       MACC(m[j], n[i-j], t0, t1, t2);
3488     }
3489     m[i-len] = t0;
3490     t0 = t1; t1 = t2; t2 = 0;
3491   }
3492 
3493   while (t0)
3494     t0 = sub(m, n, t0, len);
3495 }
3496 
3497 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3498 // multiplies so it should be up to 25% faster than Montgomery
3499 // multiplication.  However, its loop control is more complex and it
3500 // may actually run slower on some machines.
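//
// Put differently (an explanatory note): for each column the symmetric cross
// products a[j]*a[i-j] and a[i-j]*a[j] are accumulated once via MACC2 instead
// of twice via MACC, which is where the asymptotic ~25% saving in multiplies
// comes from; the m[j]*n[i-j] reduction terms still need one MACC each.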
3501 
3502 static void NOINLINE
3503 montgomery_square(julong a[], julong n[],
3504                   julong m[], julong inv, int len) {
3505   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3506   int i;
3507 
3508   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3509 
3510   for (i = 0; i < len; i++) {
3511     int j;
3512     int end = (i+1)/2;
3513     for (j = 0; j < end; j++) {
3514       MACC2(a[j], a[i-j], t0, t1, t2);
3515       MACC(m[j], n[i-j], t0, t1, t2);
3516     }
3517     if ((i & 1) == 0) {
3518       MACC(a[j], a[j], t0, t1, t2);
3519     }
3520     for (; j < i; j++) {
3521       MACC(m[j], n[i-j], t0, t1, t2);
3522     }
3523     m[i] = t0 * inv;
3524     MACC(m[i], n[0], t0, t1, t2);
3525 
3526     assert(t0 == 0, "broken Montgomery square");
3527 
3528     t0 = t1; t1 = t2; t2 = 0;
3529   }
3530 
3531   for (i = len; i < 2*len; i++) {
3532     int start = i-len+1;
3533     int end = start + (len - start)/2;
3534     int j;
3535     for (j = start; j < end; j++) {
3536       MACC2(a[j], a[i-j], t0, t1, t2);
3537       MACC(m[j], n[i-j], t0, t1, t2);
3538     }
3539     if ((i & 1) == 0) {
3540       MACC(a[j], a[j], t0, t1, t2);
3541     }
3542     for (; j < len; j++) {
3543       MACC(m[j], n[i-j], t0, t1, t2);
3544     }
3545     m[i-len] = t0;
3546     t0 = t1; t1 = t2; t2 = 0;
3547   }
3548 
3549   while (t0)
3550     t0 = sub(m, n, t0, len);
3551 }
3552 
3553 // Swap words in a longword.
3554 static julong swap(julong x) {
3555   return (x << 32) | (x >> 32);
3556 }
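// A quick sanity example of the word swap above:
// swap(0x0123456789ABCDEFULL) == 0x89ABCDEF01234567ULL.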
3557 
3558 // Copy len longwords from s to d, word-swapping as we go.  The
3559 // destination array is reversed.
3560 static void reverse_words(julong *s, julong *d, int len) {
3561   d += len;
3562   while(len-- > 0) {
3563     d--;
3564     *d = swap(*s);
3565     s++;
3566   }
3567 }
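// For example, with len == 2 the call produces d[1] = swap(s[0]) and
// d[0] = swap(s[1]). Presumably this converts between the most-significant-word-
// first jint layout handed in by the callers below and the least-significant-
// word-first julong limb order expected by the routines above (an inference,
// not something stated in this file).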
3568 
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3571 #define MONTGOMERY_SQUARING_THRESHOLD 64
3572 
3573 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3574                                         jint len, jlong inv,
3575                                         jint *m_ints) {
3576   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3577   int longwords = len/2;
3578 
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use a total of 8k bytes of stack space here.
3582   int divisor = sizeof(julong) * 4;
3583   guarantee(longwords <= 8192 / divisor, "must be");
3584   int total_allocation = longwords * sizeof (julong) * 4;
3585   julong *scratch = (julong *)alloca(total_allocation);
3586 
3587   // Local scratch arrays
3588   julong
3589     *a = scratch + 0 * longwords,
3590     *b = scratch + 1 * longwords,
3591     *n = scratch + 2 * longwords,
3592     *m = scratch + 3 * longwords;
3593 
3594   reverse_words((julong *)a_ints, a, longwords);
3595   reverse_words((julong *)b_ints, b, longwords);
3596   reverse_words((julong *)n_ints, n, longwords);
3597 
3598   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3599 
3600   reverse_words(m, (julong *)m_ints, longwords);
3601 }
3602 
3603 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3604                                       jint len, jlong inv,
3605                                       jint *m_ints) {
3606   assert(len % 2 == 0, "array length in montgomery_square must be even");
3607   int longwords = len/2;
3608 
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use a total of 6k bytes of stack space here.
3612   int divisor = sizeof(julong) * 3;
3613   guarantee(longwords <= (8192 / divisor), "must be");
3614   int total_allocation = longwords * sizeof (julong) * 3;
3615   julong *scratch = (julong *)alloca(total_allocation);
3616 
3617   // Local scratch arrays
3618   julong
3619     *a = scratch + 0 * longwords,
3620     *n = scratch + 1 * longwords,
3621     *m = scratch + 2 * longwords;
3622 
3623   reverse_words((julong *)a_ints, a, longwords);
3624   reverse_words((julong *)n_ints, n, longwords);
3625 
3626   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3627     ::montgomery_square(a, n, m, (julong)inv, longwords);
3628   } else {
3629     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3630   }
3631 
3632   reverse_words(m, (julong *)m_ints, longwords);
3633 }
3634 
3635 #if INCLUDE_JFR
3636 
3637 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3638 // It returns a jobject handle to the event writer.
3639 // The handle is dereferenced and the return value is the event writer oop.
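// A short sketch of the flow generated below: set up a last_Java_frame, call
// JfrIntrinsicSupport::write_checkpoint(thread) as a VM leaf, then run the
// returned jobject handle through resolve_global_jobject() so the caller (C2)
// receives the event writer oop itself in rax.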
3640 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3641   enum layout {
3642     rbp_off,
3643     rbpH_off,
3644     return_off,
3645     return_off2,
3646     framesize // inclusive of return address
3647   };
3648 
3649   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3650   CodeBuffer code(name, 1024, 64);
3651   MacroAssembler* masm = new MacroAssembler(&code);
3652   address start = __ pc();
3653 
3654   __ enter();
3655   address the_pc = __ pc();
3656 
3657   int frame_complete = the_pc - start;
3658 
3659   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3660   __ movptr(c_rarg0, r15_thread);
3661   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3662   __ reset_last_Java_frame(true);
3663 
3664   // rax is jobject handle result, unpack and process it through a barrier.
3665   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
3666 
3667   __ leave();
3668   __ ret(0);
3669 
3670   OopMapSet* oop_maps = new OopMapSet();
3671   OopMap* map = new OopMap(framesize, 1);
3672   oop_maps->add_gc_map(frame_complete, map);
3673 
3674   RuntimeStub* stub =
3675     RuntimeStub::new_runtime_stub(name,
3676                                   &code,
3677                                   frame_complete,
3678                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3679                                   oop_maps,
3680                                   false);
3681   return stub;
3682 }
3683 
3684 // For c2: call to return a leased buffer.
3685 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3686   enum layout {
3687     rbp_off,
3688     rbpH_off,
3689     return_off,
3690     return_off2,
3691     framesize // inclusive of return address
3692   };
3693 
3694   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3695   CodeBuffer code(name, 1024, 64);
3696   MacroAssembler* masm = new MacroAssembler(&code);
3697   address start = __ pc();
3698 
3699   __ enter();
3700   address the_pc = __ pc();
3701 
3702   int frame_complete = the_pc - start;
3703 
3704   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3705   __ movptr(c_rarg0, r15_thread);
3706   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3707   __ reset_last_Java_frame(true);
3708 
3709   __ leave();
3710   __ ret(0);
3711 
3712   OopMapSet* oop_maps = new OopMapSet();
3713   OopMap* map = new OopMap(framesize, 1);
3714   oop_maps->add_gc_map(frame_complete, map);
3715 
3716   RuntimeStub* stub =
3717     RuntimeStub::new_runtime_stub(name,
3718                                   &code,
3719                                   frame_complete,
3720                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3721                                   oop_maps,
3722                                   false);
3723   return stub;
3724 }
3725 
3726 #endif // INCLUDE_JFR
3727