1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/aotCodeCache.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
// Shorthand so generated-assembly sequences read as "__ insn(...)".
#define __ masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
// In non-product builds, embed an annotation in the generated code stream
// (visible in disassembly output).
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

// Stack alignment expressed in 4-byte VMReg stack slots rather than bytes.
const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
  // The XSAVE_AREA_* constants are byte offsets of state components within
  // the FPU/vector save area written below the GPR save area.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Each DEF_*_OFFS macro defines the low-half "_off" enumerator at the
  // computed slot index plus an adjacent "H_off" enumerator for the high half.
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // APX extended GPRs r16..r31 (saved only when UseAPX is on)
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy GPRs, pushed by save_legacy_gprs above the FPU state
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  // Emit code that saves all live registers (GPRs, flags, FPU/vector state)
  // and build an OopMap describing where each register landed.
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  // Emit code that restores everything saved by save_live_registers.
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
 182 
// Emit a full register save sequence and return an OopMap describing the
// resulting frame so deoptimization and GC can find every saved value.
// NOTE(review): additional_frame_words is not referenced in this body —
// confirm it is intentionally unused on x86_64.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Save the APX extended GPRs r16..r31 when available
  if (UseAPX) {
      int base_addr = XSAVE_AREA_EGPRS;
      off = 0;
      for (int n = 16; n < Register::number_of_registers; n++) {
        __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
      }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

  // Convert a layout-enum slot index into a stack VMReg for the oopmap.
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved by is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  // Extended GPRs are only in the map when they were actually saved above
  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  // Record the high halves of every saved 64-bit register as well.
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
 427 
// Emit the exact inverse of save_live_registers: restore vector state,
// extended/legacy GPRs, FPU state, flags and rbp in reverse order of saving.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Restore the APX extended GPRs r16..r31 when available
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
 511 
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register (rax:rdx can hold a long result)
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
 529 
 530 // Is vector's size (in bytes) bigger than a size saved by default?
 531 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
 542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.
 544 // Register up to Register::number_of_registers are the 64-bit
 545 // integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64 bit build
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
 552 // By skipping the first C ABI register we can call non-static jni methods
 553 // with small numbers of arguments without having to shuffle the arguments
 554 // at all. Since we control the java ABI we ought to at least get some
 555 // advantage out of it.
 556 
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
 637 
// Same as java_calling_convention() but for multiple return
// values. There's no way to store them on the stack so if we don't
// have enough registers, multiple values can't be returned.
// One extra integer register (rax) is available compared to the argument
// convention, hence the "+1" below.
const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
int SharedRuntime::java_return_convention(const BasicType *sig_bt,
                                          VMRegPair *regs,
                                          int total_args_passed) {
  // Create the mapping between argument positions and
  // registers.
  // rax comes first so that a single scalar result uses the conventional
  // return register; further values take the argument registers in the
  // reverse order listed here.
  static const Register INT_ArgReg[java_return_convention_max_int] = {
    rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
  };
  static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j+1) {
        regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
        int_args++;
      } else {
        // Out of integer registers: multiple values can't be returned.
        return -1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_j+1) {
        regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
        int_args++;
      } else {
        return -1;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
        fp_args++;
      } else {
        return -1;
      }
      break;
    case T_DOUBLE:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
        fp_args++;
      } else {
        return -1;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  // Number of registers consumed, or -1 above if the values don't fit.
  return int_args + fp_args;
}
 718 
// Patch the callers callsite with entry to compiled code if it exists.
// Emits code that, when Method::code is set for the callee in rbx, saves all
// CPU state and calls into the VM (fixup_callers_callsite) to repoint the
// caller's call instruction at the compiled entry.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // Nothing to do if the callee has no compiled code yet.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  // c_rarg0 = target Method*, c_rarg1 = caller's return address (the callsite)
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
 773         // inline type and T_VOID to mark the end of the value
 774         // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
 780         total_args_passed++;
 781         int vt = 1;
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
 803 
 804 
// Store one incoming compiled-convention argument (type bt, located by
// reg_pair in a register, XMM register, or caller stack slot) to the
// interpreter-convention destination `to`. prev_bt is the previous
// signature entry, used to recognize the T_VOID half of a long/double.
// size_in_bytes is 8 for two-slot (wide) values and 4 otherwise.
// extraspace biases reg2stack offsets so incoming stack args are
// addressed correctly relative to the adjusted rsp. is_oop selects a
// GC-aware store (used when writing into an inline-type buffer).
static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  // The T_VOID slot of a long/double carries no data; nothing to store.
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However to make thing extra confusing. Because we can fit a long/double in
  // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  // Argument was eliminated entirely (no location assigned): nothing to do.
  if (!r_1->is_valid()) {
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    // Integer-class value: either load it from the caller's stack into rax
    // or use the register it already lives in.
    Register val = rax;
    if (r_1->is_stack()) {
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      // r13 and rbx are live across this helper (senderSP and Method*),
      // so preserve them around the barrier-enabled oop store.
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    // Floating-point value: store straight from the XMM register.
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}
 869 
// Generate the compiled-to-interpreter adapter: repack arguments from the
// compiled layout (registers/stack, possibly scalarized inline types) into
// the interpreter's all-on-stack layout, then jump to the interpreter
// entry. rbx holds the callee Method* on entry and must survive until the
// final jump. If inline type arguments are present, buffers for them are
// allocated via a runtime call (which may safepoint, hence oop_maps /
// frame_complete / frame_size_in_words are filled in for the caller).
static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    // Record the entry that bypasses the class-init check above.
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least an inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      // The allocation may have thrown (e.g. OutOfMemoryError); if so,
      // clear the oop result and forward the pending exception.
      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
      __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      // Plain (non-scalarized) argument: copy it straight to its
      // interpreter stack slot.
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      ignored++;
      next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      Label not_null_buffer;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          vt--;
          ignored++;
        } else if (sig_extended->at(next_arg_comp)._vt_oop) {
          // buffer argument: use if non null
          VMReg buffer = regs[next_arg_comp-ignored].first();
          if (buffer->is_stack()) {
            int ld_off = buffer->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
            __ movptr(r14, Address(rsp, ld_off));
          } else {
            __ movptr(r14, buffer->as_Register());
          }
          __ testptr(r14, r14);
          __ jcc(Assembler::notEqual, not_null_buffer);
          // otherwise get the buffer from the just allocated pool of buffers
          // (rscratch2 holds the object array returned by allocate_inline_types)
          int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
          __ load_heap_oop(r14, Address(rscratch2, index));
          next_vt_arg++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            // Null: store a null reference for the interpreter and skip
            // the remaining field stores for this inline type.
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          // Store the field into the buffer held in r14 at its field offset.
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ bind(not_null_buffer);
      __ movptr(Address(rsp, st_off), r14);
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
1101 
// Generate the interpreter-to-compiled adapter: shuffle arguments from the
// interpreter's on-stack layout into the compiled calling convention
// (registers + outgoing stack slots), align the stack, and jump to the
// method's from_compiled_inline entry. rbx holds the callee Method*.
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  int total_args_passed = sig->length();

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    // ld_off addresses the first (higher-address) interpreter slot of the
    // argument; next_off addresses the second slot, which is where the
    // interpreter actually keeps a long/double value (it occupies two
    // slots but stores to only one).
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // Floating-point argument going to an XMM register.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1279 
// Emit the unverified-entry inline cache check: verify the receiver's
// klass against the CompiledICData (rax holds the IC data on entry), load
// the speculated Method* into rbx, and jump to the ic-miss stub if the
// method has since been compiled (so the callsite can be re-resolved).
static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
  Register data = rax;
  __ ic_check(1 /* end_alignment */);
  __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));

  // Method might have been compiled since the call site was patched to
  // interpreted if that is the case treat it as a miss so we can get
  // the call site corrected.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, skip_fixup);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
}
1292 
1293 // ---------------------------------------------------------------
// Generate the full set of i2c/c2i adapter entry points for one adapter
// fingerprint. With scalarized inline-type calling conventions up to three
// c2i variants are emitted: scalarized (sig_cc/regs_cc), scalarized with
// non-scalarized receiver (sig_cc_ro/regs_cc_ro), and fully non-scalarized
// (sig/regs). Entry addresses are recorded in entry_address[], and the
// AdapterBlob is created when allocate_code_blob is set.
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
                                            int comp_args_on_stack,
                                            const GrowableArray<SigEntry>* sig,
                                            const VMRegPair* regs,
                                            const GrowableArray<SigEntry>* sig_cc,
                                            const VMRegPair* regs_cc,
                                            const GrowableArray<SigEntry>* sig_cc_ro,
                                            const VMRegPair* regs_cc_ro,
                                            address entry_address[AdapterBlob::ENTRY_COUNT],
                                            AdapterBlob*& new_adapter,
                                            bool allocate_code_blob) {
  entry_address[AdapterBlob::I2C] = __ pc();
  gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
  Label skip_fixup;

  gen_inline_cache_check(masm, skip_fixup);

  OopMapSet* oop_maps = new OopMapSet();
  int frame_complete = CodeOffsets::frame_never_safe;
  int frame_size_in_words = 0;

  // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
  // regs_cc == regs_cc_ro means the two conventions are identical and the
  // separate receiver-only variant is unnecessary.
  if (regs_cc != regs_cc_ro) {
    // No class init barrier needed because method is guaranteed to be non-static
    gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
    // skip_fixup was bound inside gen_c2i_adapter; reset it so the next
    // variant can bind it again.
    skip_fixup.reset();
  }

  // Scalarized c2i adapter
  entry_address[AdapterBlob::C2I]        = __ pc();
  entry_address[AdapterBlob::C2I_Inline] = __ pc();
  gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                  skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);

  // Non-scalarized c2i adapter
  // regs != regs_cc means the scalarized convention differs from the plain
  // one, so dedicated inline entries (with their own IC check) are needed.
  if (regs != regs_cc) {
    entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
    Label inline_entry_skip_fixup;
    gen_inline_cache_check(masm, inline_entry_skip_fixup);

    entry_address[AdapterBlob::C2I_Inline] = __ pc();
    gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
  }

  // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
  // the GC knows about the location of oop argument locations passed to the c2i adapter.
  if (allocate_code_blob) {
    bool caller_must_gc_arguments = (regs != regs_cc);
    int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
    assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
    AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
    new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
  }
}
1364 
// Lay out the native (C) calling convention for the given signature.
// For each argument i, regs[i] is filled with the VMReg(s) that hold it.
// Returns the number of VMRegImpl stack slots needed for all stack-passed
// arguments, NOT counting out_preserve_stack_slots.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         int total_args_passed) {

// We return the amount of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    // Windows x64 ABI: only four argument registers, and the integer/FP
    // register pools are shared (the argument's position picks the register).
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    // System V AMD64 ABI: six integer and eight FP argument registers,
    // allocated from independent pools.
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          // Shared register pools: an integer register argument also
          // consumes the FP register at the same position.
          fp_args++;
          // Allocate slots for callee to stuff register args on the stack
          // (Windows "shadow space").
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
      case T_METADATA:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          // Shared register pools: an FP register argument also consumes
          // the integer register at the same position.
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
1478 
1479 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1480                                              uint num_bits,
1481                                              uint total_args_passed) {
1482   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1483          "only certain vector sizes are supported for now");
1484 
1485   static const XMMRegister VEC_ArgReg[32] = {
1486      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1487      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1488     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1489     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1490   };
1491 
1492   uint stk_args = 0;
1493   uint fp_args = 0;
1494 
1495   for (uint i = 0; i < total_args_passed; i++) {
1496     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1497     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1498     regs[i].set_pair(vmreg->next(next_val), vmreg);
1499   }
1500 
1501   return stk_args;
1502 }
1503 
1504 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1505   // We always ignore the frame_slots arg and just use the space just below frame pointer
1506   // which by this time is free to use
1507   switch (ret_type) {
1508   case T_FLOAT:
1509     __ movflt(Address(rbp, -wordSize), xmm0);
1510     break;
1511   case T_DOUBLE:
1512     __ movdbl(Address(rbp, -wordSize), xmm0);
1513     break;
1514   case T_VOID:  break;
1515   default: {
1516     __ movptr(Address(rbp, -wordSize), rax);
1517     }
1518   }
1519 }
1520 
1521 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1522   // We always ignore the frame_slots arg and just use the space just below frame pointer
1523   // which by this time is free to use
1524   switch (ret_type) {
1525   case T_FLOAT:
1526     __ movflt(xmm0, Address(rbp, -wordSize));
1527     break;
1528   case T_DOUBLE:
1529     __ movdbl(xmm0, Address(rbp, -wordSize));
1530     break;
1531   case T_VOID:  break;
1532   default: {
1533     __ movptr(rax, Address(rbp, -wordSize));
1534     }
1535   }
1536 }
1537 
1538 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1539     for ( int i = first_arg ; i < arg_count ; i++ ) {
1540       if (args[i].first()->is_Register()) {
1541         __ push(args[i].first()->as_Register());
1542       } else if (args[i].first()->is_XMMRegister()) {
1543         __ subptr(rsp, 2*wordSize);
1544         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1545       }
1546     }
1547 }
1548 
1549 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1550     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1551       if (args[i].first()->is_Register()) {
1552         __ pop(args[i].first()->as_Register());
1553       } else if (args[i].first()->is_XMMRegister()) {
1554         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1555         __ addptr(rsp, 2*wordSize);
1556       }
1557     }
1558 }
1559 
1560 static void verify_oop_args(MacroAssembler* masm,
1561                             const methodHandle& method,
1562                             const BasicType* sig_bt,
1563                             const VMRegPair* regs) {
1564   Register temp_reg = rbx;  // not part of any compiled calling seq
1565   if (VerifyOops) {
1566     for (int i = 0; i < method->size_of_parameters(); i++) {
1567       if (is_reference_type(sig_bt[i])) {
1568         VMReg r = regs[i].first();
1569         assert(r->is_valid(), "bad oop arg");
1570         if (r->is_stack()) {
1571           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1572           __ verify_oop(temp_reg);
1573         } else {
1574           __ verify_oop(r->as_Register());
1575         }
1576       }
1577     }
1578   }
1579 }
1580 
// Debug-only check that the argument described by actual_vmreg arrived in
// expected_reg (i.e. in a register, and in the specific one this code
// assumes), so that a calling-convention change is caught immediately.
// 'name' identifies the argument in the assert messages.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1589 
1590 
1591 //---------------------------- continuation_enter_setup ---------------------------
1592 //
1593 // Arguments:
1594 //   None.
1595 //
1596 // Results:
1597 //   rsp: pointer to blank ContinuationEntry
1598 //
1599 // Kills:
1600 //   rax
1601 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Carve out space for the ContinuationEntry on the stack.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // The OopMap covers the entry plus one additional word.
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new (still blank) entry into the thread's chain:
  //   entry->parent = thread->cont_entry; thread->cont_entry = entry (== rsp).
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1619 
1620 //---------------------------- fill_continuation_entry ---------------------------
1621 //
1622 // Arguments:
1623 //   rsp: pointer to blank Continuation entry
1624 //   reg_cont_obj: pointer to the continuation
1625 //   reg_flags: flags
1626 //
1627 // Results:
1628 //   rsp: pointer to filled out ContinuationEntry
1629 //
1630 // Kills:
1631 //   rax
1632 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Stamp a cookie so sanity checks elsewhere can recognize a valid entry.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Store the continuation oop and flags; zero chunk, argsize and pin count.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the thread's current cont_fastpath into the entry...
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  // ...then clear it for the new continuation.
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1649 
1650 //---------------------------- continuation_enter_cleanup ---------------------------
1651 //
1652 // Arguments:
1653 //   rsp: pointer to the ContinuationEntry
1654 //
1655 // Results:
1656 //   rsp: pointer to the spilled rbp in the entry frame
1657 //
1658 // Kills:
1659 //   rbx
1660 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // Verify that rsp really points at the thread's current ContinuationEntry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the parent's cont_fastpath saved by fill_continuation_entry().
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  // Unlink this entry from the thread's chain (thread->cont_entry = parent)
  // and pop the ContinuationEntry off the stack.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1675 
// Generate the wrapper for Continuation.enterSpecial(Continuation c,
// boolean isContinue, boolean isVirtualThread). Emits two entry points —
// an interpreted (i2i) entry used only in interp_only_mode and a compiled
// entry — plus a shared thaw path, normal exit, and an exception path.
// Reports (via the out-parameters) the entry/exception offsets, the
// frame-complete offset and the frame size, and registers GC maps for the
// embedded calls in oop_maps.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  // Record the thaw-call pc so the runtime can locate this call site.
  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1848 
// Generate the wrapper for Continuation.doYield(): set up a minimal frame,
// call Continuation::freeze_entry(), and on success (rax == 0) return via
// the ContinuationEntry. If the continuation is pinned (rax != 0), return
// to the caller instead, forwarding any pending exception raised by freeze.
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize /  VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Call Continuation::freeze_entry()(thread, sp); result comes back in rax.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // Non-zero result means the continuation could not be frozen (pinned).
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: switch to the ContinuationEntry's stack, tear the
  // entry down and return from enterSpecial.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1910 
// Public entry point that forwards to the file-local
// continuation_enter_cleanup() emitter defined above.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1914 
// Generate the compiled entry for a method-handle intrinsic (invokeBasic,
// the linkTo* family, or linkToNative): make sure the receiver and/or the
// trailing MemberName/NativeEntryPoint argument are in registers, then jump
// to the real target via MethodHandles::generate_method_handle_dispatch.
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1976 
1977 // ---------------------------------------------------------------------------
1978 // Generate a native wrapper for a given method.  The method takes arguments
1979 // in the Java compiled code convention, marshals them to the native
1980 // convention (handlizes oops, etc), transitions to native, makes the call,
1981 // returns to java state (possibly blocking), unhandlizes any result and
1982 // returns.
1983 //
1984 // Critical native functions are a shorthand for the use of
1985 // GetPrimtiveArrayCritical and disallow the use of any other JNI
1986 // functions.  The wrapper is expected to unpack the arguments before
1987 // passing them to the callee. Critical native functions leave the state _in_Java,
1988 // since they cannot stop for GC.
1989 // Some other parts of JNI setup are skipped like the tear down of the JNI handle
1990 // block and the check for pending exceptions it's impossible for them
1991 // to be thrown.
1992 //
1993 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1994                                                 const methodHandle& method,
1995                                                 int compile_id,
1996                                                 BasicType* in_sig_bt,
1997                                                 VMRegPair* in_regs,
1998                                                 BasicType ret_type) {
1999   if (method->is_continuation_native_intrinsic()) {
2000     int exception_offset = -1;
2001     OopMapSet* oop_maps = new OopMapSet();
2002     int frame_complete = -1;
2003     int stack_slots = -1;
2004     int interpreted_entry_offset = -1;
2005     int vep_offset = -1;
2006     if (method->is_continuation_enter_intrinsic()) {
2007       gen_continuation_enter(masm,
2008                              in_regs,
2009                              exception_offset,
2010                              oop_maps,
2011                              frame_complete,
2012                              stack_slots,
2013                              interpreted_entry_offset,
2014                              vep_offset);
2015     } else if (method->is_continuation_yield_intrinsic()) {
2016       gen_continuation_yield(masm,
2017                              in_regs,
2018                              oop_maps,
2019                              frame_complete,
2020                              stack_slots,
2021                              vep_offset);
2022     } else {
2023       guarantee(false, "Unknown Continuation native intrinsic");
2024     }
2025 
2026 #ifdef ASSERT
2027     if (method->is_continuation_enter_intrinsic()) {
2028       assert(interpreted_entry_offset != -1, "Must be set");
2029       assert(exception_offset != -1,         "Must be set");
2030     } else {
2031       assert(interpreted_entry_offset == -1, "Must be unset");
2032       assert(exception_offset == -1,         "Must be unset");
2033     }
2034     assert(frame_complete != -1,    "Must be set");
2035     assert(stack_slots != -1,       "Must be set");
2036     assert(vep_offset != -1,        "Must be set");
2037 #endif
2038 
2039     __ flush();
2040     nmethod* nm = nmethod::new_native_nmethod(method,
2041                                               compile_id,
2042                                               masm->code(),
2043                                               vep_offset,
2044                                               frame_complete,
2045                                               stack_slots,
2046                                               in_ByteSize(-1),
2047                                               in_ByteSize(-1),
2048                                               oop_maps,
2049                                               exception_offset);
2050     if (nm == nullptr) return nm;
2051     if (method->is_continuation_enter_intrinsic()) {
2052       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2053     } else if (method->is_continuation_yield_intrinsic()) {
2054       _cont_doYield_stub = nm;
2055     }
2056     return nm;
2057   }
2058 
2059   if (method->is_method_handle_intrinsic()) {
2060     vmIntrinsics::ID iid = method->intrinsic_id();
2061     intptr_t start = (intptr_t)__ pc();
2062     int vep_offset = ((intptr_t)__ pc()) - start;
2063     gen_special_dispatch(masm,
2064                          method,
2065                          in_sig_bt,
2066                          in_regs);
2067     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2068     __ flush();
2069     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2070     return nmethod::new_native_nmethod(method,
2071                                        compile_id,
2072                                        masm->code(),
2073                                        vep_offset,
2074                                        frame_complete,
2075                                        stack_slots / VMRegImpl::slots_per_word,
2076                                        in_ByteSize(-1),
2077                                        in_ByteSize(-1),
2078                                        nullptr);
2079   }
2080   address native_func = method->native_function();
2081   assert(native_func != nullptr, "must have function");
2082 
2083   // An OopMap for lock (and class if static)
2084   OopMapSet *oop_maps = new OopMapSet();
2085   intptr_t start = (intptr_t)__ pc();
2086 
2087   // We have received a description of where all the java arg are located
2088   // on entry to the wrapper. We need to convert these args to where
2089   // the jni function will expect them. To figure out where they go
2090   // we convert the java signature to a C signature by inserting
2091   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2092 
2093   const int total_in_args = method->size_of_parameters();
2094   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2095 
2096   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2097   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2098 
2099   int argc = 0;
2100   out_sig_bt[argc++] = T_ADDRESS;
2101   if (method->is_static()) {
2102     out_sig_bt[argc++] = T_OBJECT;
2103   }
2104 
2105   for (int i = 0; i < total_in_args ; i++ ) {
2106     out_sig_bt[argc++] = in_sig_bt[i];
2107   }
2108 
2109   // Now figure out where the args must be stored and how much stack space
2110   // they require.
2111   int out_arg_slots;
2112   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2113 
2114   // Compute framesize for the wrapper.  We need to handlize all oops in
2115   // incoming registers
2116 
2117   // Calculate the total number of stack slots we will need.
2118 
2119   // First count the abi requirement plus all of the outgoing args
2120   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2121 
2122   // Now the space for the inbound oop handle area
2123   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2124 
2125   int oop_handle_offset = stack_slots;
2126   stack_slots += total_save_slots;
2127 
2128   // Now any space we need for handlizing a klass if static method
2129 
2130   int klass_slot_offset = 0;
2131   int klass_offset = -1;
2132   int lock_slot_offset = 0;
2133   bool is_static = false;
2134 
2135   if (method->is_static()) {
2136     klass_slot_offset = stack_slots;
2137     stack_slots += VMRegImpl::slots_per_word;
2138     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2139     is_static = true;
2140   }
2141 
2142   // Plus a lock if needed
2143 
2144   if (method->is_synchronized()) {
2145     lock_slot_offset = stack_slots;
2146     stack_slots += VMRegImpl::slots_per_word;
2147   }
2148 
2149   // Now a place (+2) to save return values or temp during shuffling
2150   // + 4 for return address (which we own) and saved rbp
2151   stack_slots += 6;
2152 
2153   // Ok The space we have allocated will look like:
2154   //
2155   //
2156   // FP-> |                     |
2157   //      |---------------------|
2158   //      | 2 slots for moves   |
2159   //      |---------------------|
2160   //      | lock box (if sync)  |
2161   //      |---------------------| <- lock_slot_offset
2162   //      | klass (if static)   |
2163   //      |---------------------| <- klass_slot_offset
2164   //      | oopHandle area      |
2165   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2166   //      | outbound memory     |
2167   //      | based arguments     |
2168   //      |                     |
2169   //      |---------------------|
2170   //      |                     |
2171   // SP-> | out_preserved_slots |
2172   //
2173   //
2174 
2175 
2176   // Now compute actual number of stack words we need rounding to make
2177   // stack properly aligned.
2178   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2179 
2180   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2181 
2182   // First thing make an ic check to see if we should even be here
2183 
2184   // We are free to use all registers as temps without saving them and
2185   // restoring them except rbp. rbp is the only callee save register
2186   // as far as the interpreter and the compiler(s) are concerned.
2187 
2188   const Register receiver = j_rarg0;
2189 
2190   Label exception_pending;
2191 
2192   assert_different_registers(receiver, rscratch1, rscratch2);
2193   __ verify_oop(receiver);
2194   __ ic_check(8 /* end_alignment */);
2195 
2196   int vep_offset = ((intptr_t)__ pc()) - start;
2197 
2198   if (method->needs_clinit_barrier()) {
2199     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2200     Label L_skip_barrier;
2201     Register klass = r10;
2202     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2203     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2204 
2205     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2206 
2207     __ bind(L_skip_barrier);
2208   }
2209 
2210 #ifdef COMPILER1
2211   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2212   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2213     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2214   }
2215 #endif // COMPILER1
2216 
2217   // The instruction at the verified entry point must be 5 bytes or longer
2218   // because it can be patched on the fly by make_non_entrant. The stack bang
2219   // instruction fits that requirement.
2220 
2221   // Generate stack overflow check
2222   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2223 
2224   // Generate a new frame for the wrapper.
2225   __ enter();
2226   // -2 because return address is already present and so is saved rbp
2227   __ subptr(rsp, stack_size - 2*wordSize);
2228 
2229   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2230   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2231   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2232 
2233   // Frame is now completed as far as size and linkage.
2234   int frame_complete = ((intptr_t)__ pc()) - start;
2235 
2236 #ifdef ASSERT
2237   __ check_stack_alignment(rsp, "improperly aligned stack");
2238 #endif /* ASSERT */
2239 
2240 
2241   // We use r14 as the oop handle for the receiver/klass
2242   // It is callee save so it survives the call to native
2243 
2244   const Register oop_handle_reg = r14;
2245 
2246   //
2247   // We immediately shuffle the arguments so that any vm call we have to
2248   // make from here on out (sync slow path, jvmti, etc.) we will have
2249   // captured the oops from our caller and have a valid oopMap for
2250   // them.
2251 
2252   // -----------------
2253   // The Grand Shuffle
2254 
2255   // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
2257   // convention always has at least one more (and two for static) arguments than Java.
2258   // Therefore if we move the args from java -> c backwards then we will never have
2259   // a register->register conflict and we don't have to build a dependency graph
2260   // and figure out how to break any cycles.
2261   //
2262 
2263   // Record esp-based slot for receiver on stack for non-static methods
2264   int receiver_offset = -1;
2265 
2266   // This is a trick. We double the stack slots so we can claim
2267   // the oops in the caller's frame. Since we are sure to have
2268   // more args than the caller doubling is enough to make
2269   // sure we can capture all the incoming oop args from the
2270   // caller.
2271   //
2272   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2273 
2274   // Mark location of rbp (someday)
2275   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2276 
2277   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2278   // All inbound args are referenced based on rbp and all outbound args via rsp.
2279 
2280 
2281 #ifdef ASSERT
2282   bool reg_destroyed[Register::number_of_registers];
2283   bool freg_destroyed[XMMRegister::number_of_registers];
2284   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2285     reg_destroyed[r] = false;
2286   }
2287   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2288     freg_destroyed[f] = false;
2289   }
2290 
2291 #endif /* ASSERT */
2292 
2293   // For JNI natives the incoming and outgoing registers are offset upwards.
2294   GrowableArray<int> arg_order(2 * total_in_args);
2295 
2296   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2297     arg_order.push(i);
2298     arg_order.push(c_arg);
2299   }
2300 
2301   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2302     int i = arg_order.at(ai);
2303     int c_arg = arg_order.at(ai + 1);
2304     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2305 #ifdef ASSERT
2306     if (in_regs[i].first()->is_Register()) {
2307       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2308     } else if (in_regs[i].first()->is_XMMRegister()) {
2309       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2310     }
2311     if (out_regs[c_arg].first()->is_Register()) {
2312       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2313     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2314       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2315     }
2316 #endif /* ASSERT */
2317     switch (in_sig_bt[i]) {
2318       case T_ARRAY:
2319       case T_OBJECT:
2320         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2321                     ((i == 0) && (!is_static)),
2322                     &receiver_offset);
2323         break;
2324       case T_VOID:
2325         break;
2326 
2327       case T_FLOAT:
2328         __ float_move(in_regs[i], out_regs[c_arg]);
2329           break;
2330 
2331       case T_DOUBLE:
2332         assert( i + 1 < total_in_args &&
2333                 in_sig_bt[i + 1] == T_VOID &&
2334                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2335         __ double_move(in_regs[i], out_regs[c_arg]);
2336         break;
2337 
2338       case T_LONG :
2339         __ long_move(in_regs[i], out_regs[c_arg]);
2340         break;
2341 
2342       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2343 
2344       default:
2345         __ move32_64(in_regs[i], out_regs[c_arg]);
2346     }
2347   }
2348 
2349   int c_arg;
2350 
2351   // Pre-load a static method's oop into r14.  Used both by locking code and
2352   // the normal JNI call code.
2353   // point c_arg at the first arg that is already loaded in case we
2354   // need to spill before we call out
2355   c_arg = total_c_args - total_in_args;
2356 
2357   if (method->is_static()) {
2358 
2359     //  load oop into a register
2360     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2361 
    // Now handlize the static class mirror; it's known not-null.
2363     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2364     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2365 
2366     // Now get the handle
2367     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2368     // store the klass handle as second argument
2369     __ movptr(c_rarg1, oop_handle_reg);
2370     // and protect the arg if we must spill
2371     c_arg--;
2372   }
2373 
2374   // Change state to native (we save the return address in the thread, since it might not
2375   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2376   // points into the right code segment. It does not have to be the correct return pc.
2377   // We use the same pc/oopMap repeatedly when we call out
2378 
2379   Label native_return;
2380   if (method->is_object_wait0()) {
2381     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2382     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2383   } else {
2384     intptr_t the_pc = (intptr_t) __ pc();
2385     oop_maps->add_gc_map(the_pc - start, map);
2386 
2387     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2388   }
2389 
2390   // We have all of the arguments setup at this point. We must not touch any register
2391   // argument registers at this point (what if we save/restore them there are no oop?
2392 
2393   if (DTraceMethodProbes) {
2394     // protect the args we've loaded
2395     save_args(masm, total_c_args, c_arg, out_regs);
2396     __ mov_metadata(c_rarg1, method());
2397     __ call_VM_leaf(
2398       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2399       r15_thread, c_rarg1);
2400     restore_args(masm, total_c_args, c_arg, out_regs);
2401   }
2402 
2403   // RedefineClasses() tracing support for obsolete method entry
2404   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2405     // protect the args we've loaded
2406     save_args(masm, total_c_args, c_arg, out_regs);
2407     __ mov_metadata(c_rarg1, method());
2408     __ call_VM_leaf(
2409       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2410       r15_thread, c_rarg1);
2411     restore_args(masm, total_c_args, c_arg, out_regs);
2412   }
2413 
2414   // Lock a synchronized method
2415 
2416   // Register definitions used by locking and unlocking
2417 
2418   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2419   const Register obj_reg  = rbx;  // Will contain the oop
2420   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2421 
2422   Label slow_path_lock;
2423   Label lock_done;
2424 
2425   if (method->is_synchronized()) {
2426     // Get the handle (the 2nd argument)
2427     __ mov(oop_handle_reg, c_rarg1);
2428 
2429     // Get address of the box
2430 
2431     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2432 
2433     // Load the oop from the handle
2434     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2435 
2436     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2437 
2438     // Slow path will re-enter here
2439     __ bind(lock_done);
2440   }
2441 
2442   // Finally just about ready to make the JNI call
2443 
2444   // get JNIEnv* which is first argument to native
2445   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2446 
2447   // Now set thread in native
2448   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2449 
2450   __ call(RuntimeAddress(native_func));
2451 
2452   // Verify or restore cpu control state after JNI call
2453   __ restore_cpu_control_state_after_jni(rscratch1);
2454 
2455   // Unpack native results.
2456   switch (ret_type) {
2457   case T_BOOLEAN: __ c2bool(rax);            break;
2458   case T_CHAR   : __ movzwl(rax, rax);      break;
2459   case T_BYTE   : __ sign_extend_byte (rax); break;
2460   case T_SHORT  : __ sign_extend_short(rax); break;
2461   case T_INT    : /* nothing to do */        break;
2462   case T_DOUBLE :
2463   case T_FLOAT  :
2464     // Result is in xmm0 we'll save as needed
2465     break;
2466   case T_ARRAY:                 // Really a handle
2467   case T_OBJECT:                // Really a handle
2468       break; // can't de-handlize until after safepoint check
2469   case T_VOID: break;
2470   case T_LONG: break;
2471   default       : ShouldNotReachHere();
2472   }
2473 
2474   // Switch thread to "native transition" state before reading the synchronization state.
2475   // This additional state is necessary because reading and testing the synchronization
2476   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2477   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2478   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2479   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2481   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2482 
2483   // Force this write out before the read below
2484   if (!UseSystemMemoryBarrier) {
2485     __ membar(Assembler::Membar_mask_bits(
2486               Assembler::LoadLoad | Assembler::LoadStore |
2487               Assembler::StoreLoad | Assembler::StoreStore));
2488   }
2489 
2490   // check for safepoint operation in progress and/or pending suspend requests
2491   {
2492     Label Continue;
2493     Label slow_path;
2494 
2495     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2496 
2497     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2498     __ jcc(Assembler::equal, Continue);
2499     __ bind(slow_path);
2500 
2501     // Don't use call_VM as it will see a possible pending exception and forward it
2502     // and never return here preventing us from clearing _last_native_pc down below.
2503     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2504     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2505     // by hand.
2506     //
2507     __ vzeroupper();
2508     save_native_result(masm, ret_type, stack_slots);
2509     __ mov(c_rarg0, r15_thread);
2510     __ mov(r12, rsp); // remember sp
2511     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2512     __ andptr(rsp, -16); // align stack as required by ABI
2513     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2514     __ mov(rsp, r12); // restore sp
2515     __ reinit_heapbase();
2516     // Restore any method result value
2517     restore_native_result(masm, ret_type, stack_slots);
2518     __ bind(Continue);
2519   }
2520 
2521   // change thread state
2522   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2523 
2524   if (method->is_object_wait0()) {
2525     // Check preemption for Object.wait()
2526     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2527     __ cmpptr(rscratch1, NULL_WORD);
2528     __ jccb(Assembler::equal, native_return);
2529     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2530     __ jmp(rscratch1);
2531     __ bind(native_return);
2532 
2533     intptr_t the_pc = (intptr_t) __ pc();
2534     oop_maps->add_gc_map(the_pc - start, map);
2535   }
2536 
2537 
2538   Label reguard;
2539   Label reguard_done;
2540   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2541   __ jcc(Assembler::equal, reguard);
2542   __ bind(reguard_done);
2543 
2544   // native result if any is live
2545 
2546   // Unlock
2547   Label slow_path_unlock;
2548   Label unlock_done;
2549   if (method->is_synchronized()) {
2550 
2551     Label fast_done;
2552 
2553     // Get locked oop from the handle we passed to jni
2554     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2555 
2556     // Must save rax if it is live now because cmpxchg must use it
2557     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2558       save_native_result(masm, ret_type, stack_slots);
2559     }
2560 
2561     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2562 
2563     // slow path re-enters here
2564     __ bind(unlock_done);
2565     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2566       restore_native_result(masm, ret_type, stack_slots);
2567     }
2568 
2569     __ bind(fast_done);
2570   }
2571   if (DTraceMethodProbes) {
2572     save_native_result(masm, ret_type, stack_slots);
2573     __ mov_metadata(c_rarg1, method());
2574     __ call_VM_leaf(
2575          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2576          r15_thread, c_rarg1);
2577     restore_native_result(masm, ret_type, stack_slots);
2578   }
2579 
2580   __ reset_last_Java_frame(false);
2581 
2582   // Unbox oop result, e.g. JNIHandles::resolve value.
2583   if (is_reference_type(ret_type)) {
2584     __ resolve_jobject(rax /* value */,
2585                        rcx /* tmp */);
2586   }
2587 
2588   if (CheckJNICalls) {
2589     // clear_pending_jni_exception_check
2590     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2591   }
2592 
2593   // reset handle block
2594   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2595   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2596 
2597   // pop our frame
2598 
2599   __ leave();
2600 
2601 #if INCLUDE_JFR
2602   // We need to do a poll test after unwind in case the sampler
2603   // managed to sample the native frame after returning to Java.
2604   Label L_return;
2605   address poll_test_pc = __ pc();
2606   __ relocate(relocInfo::poll_return_type);
2607   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2608   __ jccb(Assembler::zero, L_return);
2609   __ lea(rscratch1, InternalAddress(poll_test_pc));
2610   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2611   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2612     "polling page return stub not created yet");
2613   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2614   __ jump(RuntimeAddress(stub));
2615   __ bind(L_return);
2616 #endif // INCLUDE_JFR
2617 
2618   // Any exception pending?
2619   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2620   __ jcc(Assembler::notEqual, exception_pending);
2621 
2622   // Return
2623 
2624   __ ret(0);
2625 
2626   // Unexpected paths are out of line and go here
2627 
2628   // forward the exception
2629   __ bind(exception_pending);
2630 
2631   // and forward the exception
2632   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2633 
2634   // Slow path locking & unlocking
2635   if (method->is_synchronized()) {
2636 
2637     // BEGIN Slow path lock
2638     __ bind(slow_path_lock);
2639 
2640     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2641     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2642 
2643     // protect the args we've loaded
2644     save_args(masm, total_c_args, c_arg, out_regs);
2645 
2646     __ mov(c_rarg0, obj_reg);
2647     __ mov(c_rarg1, lock_reg);
2648     __ mov(c_rarg2, r15_thread);
2649 
2650     // Not a leaf but we have last_Java_frame setup as we want.
2651     // We don't want to unmount in case of contention since that would complicate preserving
2652     // the arguments that had already been marshalled into the native convention. So we force
2653     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2654     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2655     __ push_cont_fastpath();
2656     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2657     __ pop_cont_fastpath();
2658     restore_args(masm, total_c_args, c_arg, out_regs);
2659 
2660 #ifdef ASSERT
2661     { Label L;
2662     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2663     __ jcc(Assembler::equal, L);
2664     __ stop("no pending exception allowed on exit from monitorenter");
2665     __ bind(L);
2666     }
2667 #endif
2668     __ jmp(lock_done);
2669 
2670     // END Slow path lock
2671 
2672     // BEGIN Slow path unlock
2673     __ bind(slow_path_unlock);
2674 
2675     // If we haven't already saved the native result we must save it now as xmm registers
2676     // are still exposed.
2677     __ vzeroupper();
2678     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2679       save_native_result(masm, ret_type, stack_slots);
2680     }
2681 
2682     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2683 
2684     __ mov(c_rarg0, obj_reg);
2685     __ mov(c_rarg2, r15_thread);
2686     __ mov(r12, rsp); // remember sp
2687     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2688     __ andptr(rsp, -16); // align stack as required by ABI
2689 
2690     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2691     // NOTE that obj_reg == rbx currently
2692     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2693     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2694 
2695     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2696     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2697     __ mov(rsp, r12); // restore sp
2698     __ reinit_heapbase();
2699 #ifdef ASSERT
2700     {
2701       Label L;
2702       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2703       __ jcc(Assembler::equal, L);
2704       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2705       __ bind(L);
2706     }
2707 #endif /* ASSERT */
2708 
2709     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2710 
2711     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2712       restore_native_result(masm, ret_type, stack_slots);
2713     }
2714     __ jmp(unlock_done);
2715 
2716     // END Slow path unlock
2717 
2718   } // synchronized
2719 
2720   // SLOW PATH Reguard the stack if needed
2721 
2722   __ bind(reguard);
2723   __ vzeroupper();
2724   save_native_result(masm, ret_type, stack_slots);
2725   __ mov(r12, rsp); // remember sp
2726   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2727   __ andptr(rsp, -16); // align stack as required by ABI
2728   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2729   __ mov(rsp, r12); // restore sp
2730   __ reinit_heapbase();
2731   restore_native_result(masm, ret_type, stack_slots);
2732   // and continue
2733   __ jmp(reguard_done);
2734 
2735 
2736 
2737   __ flush();
2738 
2739   nmethod *nm = nmethod::new_native_nmethod(method,
2740                                             compile_id,
2741                                             masm->code(),
2742                                             vep_offset,
2743                                             frame_complete,
2744                                             stack_slots / VMRegImpl::slots_per_word,
2745                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2746                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2747                                             oop_maps);
2748 
2749   return nm;
2750 }
2751 
2752 // this function returns the adjust size (in number of words) to a c2i adapter
2753 // activation for use during deoptimization
2754 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2755   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2756 }
2757 
2758 
2759 uint SharedRuntime::out_preserve_stack_slots() {
2760   return 0;
2761 }
2762 
2763 
2764 // Number of stack slots between incoming argument block and the start of
2765 // a new frame.  The PROLOG must add this many slots to the stack.  The
2766 // EPILOG must remove this many slots.  amd64 needs two slots for
2767 // return address.
2768 uint SharedRuntime::in_preserve_stack_slots() {
2769   return 4 + 2 * VerifyStackAtCalls;
2770 }
2771 
2772 VMReg SharedRuntime::thread_register() {
2773   return r15_thread->as_VMReg();
2774 }
2775 
2776 //------------------------------generate_deopt_blob----------------------------
2777 void SharedRuntime::generate_deopt_blob() {
2778   // Allocate space for the code
2779   ResourceMark rm;
2780   // Setup code generation tools
2781   int pad = 0;
2782   if (UseAVX > 2) {
2783     pad += 1024;
2784   }
2785   if (UseAPX) {
2786     pad += 1024;
2787   }
2788 #if INCLUDE_JVMCI
2789   if (EnableJVMCI) {
2790     pad += 512; // Increase the buffer size when compiling for JVMCI
2791   }
2792 #endif
2793   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2794   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2795   if (blob != nullptr) {
2796     _deopt_blob = blob->as_deoptimization_blob();
2797     return;
2798   }
2799 
2800   CodeBuffer buffer(name, 2560+pad, 1024);
2801   MacroAssembler* masm = new MacroAssembler(&buffer);
2802   int frame_size_in_words;
2803   OopMap* map = nullptr;
2804   OopMapSet *oop_maps = new OopMapSet();
2805 
2806   // -------------
2807   // This code enters when returning to a de-optimized nmethod.  A return
2808   // address has been pushed on the stack, and return values are in
2809   // registers.
2810   // If we are doing a normal deopt then we were called from the patched
2811   // nmethod from the point we returned to the nmethod. So the return
2812   // address on the stack is wrong by NativeCall::instruction_size
2813   // We will adjust the value so it looks like we have the original return
2814   // address on the stack (like when we eagerly deoptimized).
2815   // In the case of an exception pending when deoptimizing, we enter
2816   // with a return address on the stack that points after the call we patched
2817   // into the exception handler. We have the following register state from,
2818   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2819   //    rax: exception oop
2820   //    rbx: exception handler
2821   //    rdx: throwing pc
2822   // So in this case we simply jam rdx into the useless return address and
2823   // the stack looks just like we want.
2824   //
2825   // At this point we need to de-opt.  We save the argument return
2826   // registers.  We call the first C routine, fetch_unroll_info().  This
2827   // routine captures the return values and returns a structure which
2828   // describes the current frame size and the sizes of all replacement frames.
2829   // The current frame is compiled code and may contain many inlined
2830   // functions, each with their own JVM state.  We pop the current frame, then
2831   // push all the new frames.  Then we call the C routine unpack_frames() to
2832   // populate these frames.  Finally unpack_frames() returns us the new target
2833   // address.  Notice that callee-save registers are BLOWN here; they have
2834   // already been captured in the vframeArray at the time the return PC was
2835   // patched.
2836   address start = __ pc();
2837   Label cont;
2838 
2839   // Prolog for non exception case!
2840 
2841   // Save everything in sight.
2842   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2843 
2844   // Normal deoptimization.  Save exec mode for unpack_frames.
2845   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2846   __ jmp(cont);
2847 
2848   int reexecute_offset = __ pc() - start;
2849 #if INCLUDE_JVMCI && !defined(COMPILER1)
2850   if (UseJVMCICompiler) {
2851     // JVMCI does not use this kind of deoptimization
2852     __ should_not_reach_here();
2853   }
2854 #endif
2855 
2856   // Reexecute case
  // return address is the pc that describes what bci to re-execute at
2858 
2859   // No need to update map as each call to save_live_registers will produce identical oopmap
2860   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2861 
2862   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2863   __ jmp(cont);
2864 
2865 #if INCLUDE_JVMCI
2866   Label after_fetch_unroll_info_call;
2867   int implicit_exception_uncommon_trap_offset = 0;
2868   int uncommon_trap_offset = 0;
2869 
2870   if (EnableJVMCI) {
2871     implicit_exception_uncommon_trap_offset = __ pc() - start;
2872 
2873     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2874     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2875 
2876     uncommon_trap_offset = __ pc() - start;
2877 
2878     // Save everything in sight.
2879     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2880     // fetch_unroll_info needs to call last_java_frame()
2881     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2882 
2883     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2884     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2885 
2886     __ movl(r14, Deoptimization::Unpack_reexecute);
2887     __ mov(c_rarg0, r15_thread);
2888     __ movl(c_rarg2, r14); // exec mode
2889     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2890     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2891 
2892     __ reset_last_Java_frame(false);
2893 
2894     __ jmp(after_fetch_unroll_info_call);
2895   } // EnableJVMCI
2896 #endif // INCLUDE_JVMCI
2897 
2898   int exception_offset = __ pc() - start;
2899 
2900   // Prolog for exception case
2901 
2902   // all registers are dead at this entry point, except for rax, and
2903   // rdx which contain the exception oop and exception pc
2904   // respectively.  Set them in TLS and fall thru to the
2905   // unpack_with_exception_in_tls entry point.
2906 
2907   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2908   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2909 
2910   int exception_in_tls_offset = __ pc() - start;
2911 
2912   // new implementation because exception oop is now passed in JavaThread
2913 
2914   // Prolog for exception case
2915   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2917   // tos: stack at point of call to method that threw the exception (i.e. only
2918   // args are on the stack, no return address)
2919 
2920   // make room on stack for the return address
2921   // It will be patched later with the throwing pc. The correct value is not
2922   // available now because loading it from memory would destroy registers.
2923   __ push(0);
2924 
2925   // Save everything in sight.
2926   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2927 
2928   // Now it is safe to overwrite any register
2929 
2930   // Deopt during an exception.  Save exec mode for unpack_frames.
2931   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2932 
2933   // load throwing pc from JavaThread and patch it as the return address
2934   // of the current frame. Then clear the field in JavaThread
2935 
2936   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2937   __ movptr(Address(rbp, wordSize), rdx);
2938   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2939 
2940 #ifdef ASSERT
2941   // verify that there is really an exception oop in JavaThread
2942   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2943   __ verify_oop(rax);
2944 
2945   // verify that there is no pending exception
2946   Label no_pending_exception;
2947   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2948   __ testptr(rax, rax);
2949   __ jcc(Assembler::zero, no_pending_exception);
2950   __ stop("must not have pending exception here");
2951   __ bind(no_pending_exception);
2952 #endif
2953 
2954   __ bind(cont);
2955 
2956   // Call C code.  Need thread and this frame, but NOT official VM entry
2957   // crud.  We cannot block on this call, no GC can happen.
2958   //
2959   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2960 
2961   // fetch_unroll_info needs to call last_java_frame().
2962 
2963   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2964 #ifdef ASSERT
2965   { Label L;
2966     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2967     __ jcc(Assembler::equal, L);
2968     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2969     __ bind(L);
2970   }
2971 #endif // ASSERT
2972   __ mov(c_rarg0, r15_thread);
2973   __ movl(c_rarg1, r14); // exec_mode
2974   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2975 
2976   // Need to have an oopmap that tells fetch_unroll_info where to
2977   // find any register it might need.
2978   oop_maps->add_gc_map(__ pc() - start, map);
2979 
2980   __ reset_last_Java_frame(false);
2981 
2982 #if INCLUDE_JVMCI
2983   if (EnableJVMCI) {
2984     __ bind(after_fetch_unroll_info_call);
2985   }
2986 #endif
2987 
2988   // Load UnrollBlock* into rdi
2989   __ mov(rdi, rax);
2990 
2991   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2992    Label noException;
2993   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2994   __ jcc(Assembler::notEqual, noException);
2995   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2996   // QQQ this is useless it was null above
2997   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2998   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2999   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3000 
3001   __ verify_oop(rax);
3002 
3003   // Overwrite the result registers with the exception results.
3004   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3005   // I think this is useless
3006   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3007 
3008   __ bind(noException);
3009 
3010   // Only register save data is on the stack.
3011   // Now restore the result registers.  Everything else is either dead
3012   // or captured in the vframeArray.
3013   RegisterSaver::restore_result_registers(masm);
3014 
3015   // All of the register save area has been popped of the stack. Only the
3016   // return address remains.
3017 
3018   // Pop all the frames we must move/replace.
3019   //
3020   // Frame picture (youngest to oldest)
3021   // 1: self-frame (no frame link)
3022   // 2: deopting frame  (no frame link)
3023   // 3: caller of deopting frame (could be compiled/interpreted).
3024   //
3025   // Note: by leaving the return address of self-frame on the stack
3026   // and using the size of frame 2 to adjust the stack
3027   // when we are done the return to frame 3 will still be on the stack.
3028 
3029   // Pop deoptimized frame
3030   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3031   __ addptr(rsp, rcx);
3032 
3033   // rsp should be pointing at the return address to the caller (3)
3034 
3035   // Pick up the initial fp we should save
3036   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3037   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3038 
3039 #ifdef ASSERT
3040   // Compilers generate code that bang the stack by as much as the
3041   // interpreter would need. So this stack banging should never
3042   // trigger a fault. Verify that it does not on non product builds.
3043   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3044   __ bang_stack_size(rbx, rcx);
3045 #endif
3046 
3047   // Load address of array of frame pcs into rcx
3048   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3049 
3050   // Trash the old pc
3051   __ addptr(rsp, wordSize);
3052 
3053   // Load address of array of frame sizes into rsi
3054   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3055 
3056   // Load counter into rdx
3057   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3058 
3059   // Now adjust the caller's stack to make up for the extra locals
3060   // but record the original sp so that we can save it in the skeletal interpreter
3061   // frame and the stack walking of interpreter_sender will get the unextended sp
3062   // value and not the "real" sp value.
3063 
3064   const Register sender_sp = r8;
3065 
3066   __ mov(sender_sp, rsp);
3067   __ movl(rbx, Address(rdi,
3068                        Deoptimization::UnrollBlock::
3069                        caller_adjustment_offset()));
3070   __ subptr(rsp, rbx);
3071 
3072   // Push interpreter frames in a loop
3073   Label loop;
3074   __ bind(loop);
3075   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3076   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3077   __ pushptr(Address(rcx, 0));          // Save return address
3078   __ enter();                           // Save old & set new ebp
3079   __ subptr(rsp, rbx);                  // Prolog
3080   // This value is corrected by layout_activation_impl
3081   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3082   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3083   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3084   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3085   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3086   __ decrementl(rdx);                   // Decrement counter
3087   __ jcc(Assembler::notZero, loop);
3088   __ pushptr(Address(rcx, 0));          // Save final return address
3089 
3090   // Re-push self-frame
3091   __ enter();                           // Save old & set new ebp
3092 
3093   // Allocate a full sized register save area.
3094   // Return address and rbp are in place, so we allocate two less words.
3095   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3096 
3097   // Restore frame locals after moving the frame
3098   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3099   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3100 
3101   // Call C code.  Need thread but NOT official VM entry
3102   // crud.  We cannot block on this call, no GC can happen.  Call should
3103   // restore return values to their stack-slots with the new SP.
3104   //
3105   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3106 
3107   // Use rbp because the frames look interpreted now
3108   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3109   // Don't need the precise return PC here, just precise enough to point into this code blob.
3110   address the_pc = __ pc();
3111   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3112 
3113   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3114   __ mov(c_rarg0, r15_thread);
3115   __ movl(c_rarg1, r14); // second arg: exec_mode
3116   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3117   // Revert SP alignment after call since we're going to do some SP relative addressing below
3118   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3119 
3120   // Set an oopmap for the call site
3121   // Use the same PC we used for the last java frame
3122   oop_maps->add_gc_map(the_pc - start,
3123                        new OopMap( frame_size_in_words, 0 ));
3124 
3125   // Clear fp AND pc
3126   __ reset_last_Java_frame(true);
3127 
3128   // Collect return values
3129   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3130   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3131   // I think this is useless (throwing pc?)
3132   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3133 
3134   // Pop self-frame.
3135   __ leave();                           // Epilog
3136 
3137   // Jump to interpreter
3138   __ ret(0);
3139 
3140   // Make sure all code is generated
3141   masm->flush();
3142 
3143   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3144   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3145 #if INCLUDE_JVMCI
3146   if (EnableJVMCI) {
3147     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3148     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3149   }
3150 #endif
3151 
3152   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3153 }
3154 
//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
// The blob is entered from a compiled method's safepoint poll.  It saves
// the complete register state, calls the VM routine 'call_ptr', and then
// either forwards a pending exception or restores all registers and
// returns to the interrupted code (adjusting the return pc to step over
// the poll instruction when the poll was not at a return site).
//
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code.  Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Reuse a previously AOT-cached blob for this stub id if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: the poll was taken at a return instruction, so the return
  // address already on the stack is correct and must not be adjusted.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    // rbx here is only a placeholder; the slot is overwritten below with
    // the real return pc fetched from the thread's saved_exception_pc.
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    //      85 00       test   %eax,(%rax)
    //      85 01       test   %eax,(%rcx)
    //      85 02       test   %eax,(%rdx)
    //      85 03       test   %eax,(%rbx)
    //      85 06       test   %eax,(%rsi)
    //      85 07       test   %eax,(%rdi)
    //
    //   41 85 00       test   %eax,(%r8)
    //   41 85 01       test   %eax,(%r9)
    //   41 85 02       test   %eax,(%r10)
    //   41 85 03       test   %eax,(%r11)
    //   41 85 06       test   %eax,(%r14)
    //   41 85 07       test   %eax,(%r15)
    //
    //      85 04 24    test   %eax,(%rsp)
    //   41 85 04 24    test   %eax,(%r12)
    //      85 45 00    test   %eax,0x0(%rbp)
    //   41 85 45 00    test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is b/w [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     there by adding additional byte to instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // A two-byte REX2 prefix may precede the test opcode on APX targets.
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    // A single-byte REX.B prefix may precede the test opcode (r8-r15 bases).
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Store the freshly generated blob in the AOT code cache for reuse.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3339 
3340 //
3341 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3342 //
3343 // Generate a stub that calls into vm to find out the proper destination
3344 // of a java call. All the argument registers are live at this point
3345 // but since this is generic code we don't know what they are and the caller
3346 // must do any gc of the args.
3347 //
3348 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3349   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3350   assert(is_resolve_id(id), "expected a resolve stub id");
3351 
3352   const char* name = SharedRuntime::stub_name(id);
3353   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3354   if (blob != nullptr) {
3355     return blob->as_runtime_stub();
3356   }
3357 
3358   // allocate space for the code
3359   ResourceMark rm;
3360   CodeBuffer buffer(name, 1552, 512);
3361   MacroAssembler* masm = new MacroAssembler(&buffer);
3362 
3363   int frame_size_in_words;
3364 
3365   OopMapSet *oop_maps = new OopMapSet();
3366   OopMap* map = nullptr;
3367 
3368   int start = __ offset();
3369 
3370   // No need to save vector registers since they are caller-saved anyway.
3371   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3372 
3373   int frame_complete = __ offset();
3374 
3375   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3376 
3377   __ mov(c_rarg0, r15_thread);
3378 
3379   __ call(RuntimeAddress(destination));
3380 
3381 
3382   // Set an oopmap for the call site.
3383   // We need this not only for callee-saved registers, but also for volatile
3384   // registers that the compiler might be keeping live across a safepoint.
3385 
3386   oop_maps->add_gc_map( __ offset() - start, map);
3387 
3388   // rax contains the address we are going to jump to assuming no exception got installed
3389 
3390   // clear last_Java_sp
3391   __ reset_last_Java_frame(false);
3392   // check for pending exceptions
3393   Label pending;
3394   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3395   __ jcc(Assembler::notEqual, pending);
3396 
3397   // get the returned Method*
3398   __ get_vm_result_metadata(rbx);
3399   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3400 
3401   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3402 
3403   RegisterSaver::restore_live_registers(masm);
3404 
3405   // We are back to the original state on entry and ready to go.
3406 
3407   __ jmp(rax);
3408 
3409   // Pending exception after the safepoint
3410 
3411   __ bind(pending);
3412 
3413   RegisterSaver::restore_live_registers(masm);
3414 
3415   // exception pending => remove activation and forward to exception handler
3416 
3417   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3418 
3419   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3420   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3421 
3422   // -------------
3423   // make sure all code is generated
3424   masm->flush();
3425 
3426   // return the  blob
3427   // frame_size_words or bytes??
3428   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3429 
3430   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3431   return rs_blob;
3432 }
3433 
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs.  If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // Offsets are in 32-bit slots (BytesPerInt); each 64-bit value uses two.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size  = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Reuse a previously AOT-cached blob for this stub id if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  // (framesize-4 slots: the four slots above are already on the stack)
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  // the_pc only needs to point into this blob, not at a precise instruction.
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

  // Call runtime
  // The thread is the only argument passed to 'runtime_entry'.
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  // No oops are live in this frame, so an empty map suffices.
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  // The runtime entry is expected to have installed an exception;
  // reaching here without one is a bug.
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  // Store the freshly generated blob in the AOT code cache for reuse.
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3539 
3540 //------------------------------Montgomery multiplication------------------------
3541 //
3542 
#ifndef _WINDOWS

// Subtract 0:b from carry:a.  Return carry.
// a and b are little-endian arrays of 'len' julongs; a is updated in place
// with a - b (borrow propagated through all words), and the return value is
// 'carry' with the final borrow subtracted from it.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // clc clears CF; the loop then subtracts word-by-word with sbb so the
  // borrow chains across iterations (inc/dec do not modify CF), and the
  // trailing sbb folds the final borrow into 'carry'.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// In effect: (T2:T1:T0) += A * B, with carries rippling up.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
// In effect: (T2:T1:T0) += 2 * A * B.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

#else //_WINDOWS

// Subtract 0:b from carry:a.  Return carry.
// Windows variant of the asm routine above: a - b is computed as
// a + ~b + 1 via _addcarry_u64 (carry-in starts at 1), and the final
// step folds the borrow into 'carry'.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// Intrinsic-based equivalent of the asm MACC: (T2:T1:T0) += A * B.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
// The product is accumulated in two passes: (T2:T1:T0) += 2 * A * B.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

#endif //_WINDOWS
3625 
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
//
// a, b: multiplicands, little-endian arrays of 'len' julongs
// n:    modulus, little-endian array of 'len' julongs
// m:    result, 'len' julongs; also holds the per-step reduction
//       factors during the first loop before being overwritten
// inv:  satisfies inv * n[0] == (julong)-1 (mod 2^64), i.e. -1/n[0],
//       as checked by the assert below

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // Low half: for each output word choose m[i] so the running total
  // becomes divisible by 2^64 (its low word must cancel to zero).
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right by one 64-bit word.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // High half: these words become the (pre-reduction) result in m.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // While an overflow word remains, subtract n until m fits in len words.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3666 
// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
//
// a:   value to square, little-endian array of 'len' julongs
// n:   modulus, little-endian array of 'len' julongs
// m:   result, 'len' julongs (also scratch, as in montgomery_multiply)
// inv: satisfies inv * n[0] == (julong)-1 (mod 2^64), checked below

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // Low half.  Off-diagonal products a[j]*a[i-j] appear twice in the
  // square, so MACC2 accumulates them doubled; the diagonal term
  // a[j]*a[j] (only present when i is even) is added once.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    // Remaining reduction terms are accumulated singly.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator right by one 64-bit word.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // High half: same doubling scheme, result words written to m.
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // While an overflow word remains, subtract n until m fits in len words.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3722 
3723 // Swap words in a longword.
3724 static julong swap(julong x) {
3725   return (x << 32) | (x >> 32);
3726 }
3727 
3728 // Copy len longwords from s to d, word-swapping as we go.  The
3729 // destination array is reversed.
3730 static void reverse_words(julong *s, julong *d, int len) {
3731   d += len;
3732   while(len-- > 0) {
3733     d--;
3734     *d = swap(*s);
3735     s++;
3736   }
3737 }
3738 
3739 // The threshold at which squaring is advantageous was determined
3740 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3741 #define MONTGOMERY_SQUARING_THRESHOLD 64
3742 
3743 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3744                                         jint len, jlong inv,
3745                                         jint *m_ints) {
3746   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3747   int longwords = len/2;
3748 
3749   // Make very sure we don't use so much space that the stack might
3750   // overflow.  512 jints corresponds to an 16384-bit integer and
3751   // will use here a total of 8k bytes of stack space.
3752   int divisor = sizeof(julong) * 4;
3753   guarantee(longwords <= 8192 / divisor, "must be");
3754   int total_allocation = longwords * sizeof (julong) * 4;
3755   julong *scratch = (julong *)alloca(total_allocation);
3756 
3757   // Local scratch arrays
3758   julong
3759     *a = scratch + 0 * longwords,
3760     *b = scratch + 1 * longwords,
3761     *n = scratch + 2 * longwords,
3762     *m = scratch + 3 * longwords;
3763 
3764   reverse_words((julong *)a_ints, a, longwords);
3765   reverse_words((julong *)b_ints, b, longwords);
3766   reverse_words((julong *)n_ints, n, longwords);
3767 
3768   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3769 
3770   reverse_words(m, (julong *)m_ints, longwords);
3771 }
3772 
3773 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3774                                       jint len, jlong inv,
3775                                       jint *m_ints) {
3776   assert(len % 2 == 0, "array length in montgomery_square must be even");
3777   int longwords = len/2;
3778 
3779   // Make very sure we don't use so much space that the stack might
3780   // overflow.  512 jints corresponds to an 16384-bit integer and
3781   // will use here a total of 6k bytes of stack space.
3782   int divisor = sizeof(julong) * 3;
3783   guarantee(longwords <= (8192 / divisor), "must be");
3784   int total_allocation = longwords * sizeof (julong) * 3;
3785   julong *scratch = (julong *)alloca(total_allocation);
3786 
3787   // Local scratch arrays
3788   julong
3789     *a = scratch + 0 * longwords,
3790     *n = scratch + 1 * longwords,
3791     *m = scratch + 2 * longwords;
3792 
3793   reverse_words((julong *)a_ints, a, longwords);
3794   reverse_words((julong *)n_ints, n, longwords);
3795 
3796   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3797     ::montgomery_square(a, n, m, (julong)inv, longwords);
3798   } else {
3799     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3800   }
3801 
3802   reverse_words(m, (julong *)m_ints, longwords);
3803 }
3804 
// Generates the pack/unpack adapter stubs for a buffered inline type vk:
//  - pack:   store field values currently held in return registers into the
//            heap buffer whose oop is in rax
//  - unpack: load the fields of the buffer in rax back into return registers
// Returns nullptr if the code buffer cannot be allocated.
BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
  BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
  if (buf == nullptr) {
    return nullptr;
  }
  CodeBuffer buffer(buf);
  short buffer_locs[20];
  // Relocation records live in this small stack array shared with the buffer.
  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
                                         sizeof(buffer_locs)/sizeof(relocInfo));

  MacroAssembler* masm = new MacroAssembler(&buffer);

  // Extended signature (field layout) and the registers used to return it.
  const Array<SigEntry>* sig_vk = vk->extended_sig();
  const Array<VMRegPair>* regs = vk->return_regs();

  int pack_fields_jobject_off = __ offset();
  // Resolve pre-allocated buffer from JNI handle.
  // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
  __ movptr(rax, Address(r13, 0));
  __ resolve_jobject(rax /* value */,
                     r12 /* tmp */);
  __ movptr(Address(r13, 0), rax);

  int pack_fields_off = __ offset();

  // j indexes regs; it starts at 1 — regs->at(0) appears reserved
  // (presumably for the inline-type oop itself in rax) — TODO confirm.
  int j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      // T_VOID marks the second half of a long/double; it consumes a
      // register pair slot only when it follows T_LONG or T_DOUBLE.
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    Address to(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(to, r_1->as_XMMRegister());
    } else if (bt == T_DOUBLE) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      Register val = r_1->as_Register();
      assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
      if (is_reference_type(bt)) {
        // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
        __ mov(rbx, rax);
        Address to_with_rbx(rbx, off);
        __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        // Sub-word primitives are stored with their exact field width.
        __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
      }
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");
  if (vk->supports_nullable_layouts()) {
    // Set the null marker
    __ movb(Address(rax, vk->null_marker_offset()), 1);
  }
  __ ret(0);

  int unpack_fields_off = __ offset();

  Label skip;
  Label not_null;
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, not_null);

  // Return value is null. Zero all registers because the runtime requires a canonical
  // representation of a flat null.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }

    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    if (r_1->is_XMMRegister()) {
      __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
    } else {
      __ xorl(r_1->as_Register(), r_1->as_Register());
    }
    j++;
  }
  __ jmp(skip);
  __ bind(not_null);

  // Non-null buffer: load each field from the buffer into its register.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      // rax must stay valid as the buffer base for subsequent fields.
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      size_t size_in_bytes = type2aelembytes(bt);
      // char and boolean are zero-extended; other primitives sign-extended.
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}
3951 
3952 #if INCLUDE_JFR
3953 
3954 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3955 // It returns a jobject handle to the event writer.
3956 // The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Stack frame layout in slots (each enumerator is one 32-bit slot).
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  // Build a minimal frame so the runtime call below can walk the stack.
  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame before transitioning into the VM.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  // Register an (empty) oop map at the safepoint-visible pc so GC can
  // inspect this frame while the VM call is in progress.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4000 
4001 // For c2: call to return a leased buffer.
// For c2: call to return a leased buffer.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Stack frame layout in slots (each enumerator is one 32-bit slot).
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  // Build a minimal frame so the runtime call below can walk the stack.
  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame before transitioning into the VM.
  // NOTE(review): the sibling checkpoint stub uses rscratch1 here; the use
  // of rscratch2 looks intentional but is asymmetric — confirm.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  // Register an (empty) oop map at the safepoint-visible pc so GC can
  // inspect this frame while the VM call is in progress.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4042 
4043 #endif // INCLUDE_JFR