1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/aotCodeCache.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif // PRODUCT
  80 
  81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
  //
  // The XSAVE_AREA_* constants are byte offsets of the individual state
  // components within the extended save area laid down on the stack
  // (legacy FXSAVE image, YMM upper halves, APX eGPRs, opmask registers,
  // ZMM upper halves, and the upper register bank zmm16..zmm31). They are
  // assumed to match the processor's fixed XSAVE layout used by
  // push_FPU_state()/save_live_registers() — see the Intel SDM.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Each DEF_*_OFFS(n) defines a pair of enum constants, <reg>n_off and
  // <reg>nH_off, naming the low/high 32-bit halves of that register's slot.
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  // Offsets (in jint stack slots, from the bottom of the save frame) of
  // every saved register. Lower offsets are lower addresses: the FPU/XSAVE
  // state sits below the general-purpose registers.
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // APX extended general-purpose registers r16..r31 (saved only if UseAPX)
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy general-purpose registers, in the order written by
    // save_legacy_gprs() (they sit above the FPU/XSAVE state).
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  // Save/restore all live registers around a call into the VM. When
  // save_wide_vectors is true the upper YMM/ZMM halves are preserved too.
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
 182 
// Emit code that saves every live register (legacy GPRs, flags, FPU/XMM
// state, and optionally the wide YMM/ZMM vector state plus opmask/APX
// registers) into a newly pushed frame, and build an OopMap describing the
// saved locations so GC and deoptimization can find all oops and debug-info
// values at this safepoint.
//   additional_frame_words - not used by this implementation; the frame size
//                            is fully determined by the 'layout' enum
//   total_frame_words      - out-param: resulting frame size in words
//   save_wide_vectors      - also save vector state wider than 16 bytes
//                            (only meaningful for C2/JVMCI-compiled code)
// Returns the OopMap for this frame.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  // (num_xmm_regs is a multiple of 16 here, so this also yields 16-byte alignment)
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Save the APX extended general-purpose registers r16..r31
  if (UseAPX) {
      int base_addr = XSAVE_AREA_EGPRS;
      off = 0;
      for (int n = 16; n < Register::number_of_registers; n++) {
        __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
      }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  // Record the save slot of each legacy GPR (low 32-bit halves here,
  // high halves below).
  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  // APX extended GPRs, only saved (above) when UseAPX is on
  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      // next(4) skips past the low 4 slots (the XMM portion mapped above)
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        // next(8) skips past the low 8 slots (the YMM portion)
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  // (map the high 32-bit halves of every saved register as well)
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
 427 
// Emit code that restores all registers previously saved by
// save_live_registers() and tears the save frame down, leaving only the
// caller's return address on the stack. restore_wide_vectors must match
// the save_wide_vectors value used when the registers were saved, since it
// selects the same stack layout.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Restore the APX extended general-purpose registers r16..r31
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state (mirror of the save sequence, in reverse order)
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
 511 
// Emit code that restores only the result registers (xmm0 for floating
// point, rax/rdx for integers) from the register save area, then pops the
// whole save area so that just the return address remains on the stack.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
 529 
 530 // Is vector's size (in bytes) bigger than a size saved by default?
 531 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
 542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64 bit build
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
 552 // By skipping the first C ABI register we can call non-static jni methods
 553 // with small numbers of arguments without having to shuffle the arguments
 554 // at all. Since we control the java ABI we ought to at least get some
 555 // advantage out of it.
 556 
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
 637 
 638 // Same as java_calling_convention() but for multiple return
 639 // values. There's no way to store them on the stack so if we don't
 640 // have enough registers, multiple values can't be returned.
 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 644                                           VMRegPair *regs,
 645                                           int total_args_passed) {
 646   // Create the mapping between argument positions and
 647   // registers.
 648   static const Register INT_ArgReg[java_return_convention_max_int] = {
 649     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 650   };
 651   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 652     j_farg0, j_farg1, j_farg2, j_farg3,
 653     j_farg4, j_farg5, j_farg6, j_farg7
 654   };
 655 
 656 
 657   uint int_args = 0;
 658   uint fp_args = 0;
 659 
 660   for (int i = 0; i < total_args_passed; i++) {
 661     switch (sig_bt[i]) {
 662     case T_BOOLEAN:
 663     case T_CHAR:
 664     case T_BYTE:
 665     case T_SHORT:
 666     case T_INT:
 667       if (int_args < Argument::n_int_register_parameters_j+1) {
 668         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 669         int_args++;
 670       } else {
 671         return -1;
 672       }
 673       break;
 674     case T_VOID:
 675       // halves of T_LONG or T_DOUBLE
 676       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 677       regs[i].set_bad();
 678       break;
 679     case T_LONG:
 680       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 681       // fall through
 682     case T_OBJECT:
 683     case T_ARRAY:
 684     case T_ADDRESS:
 685     case T_METADATA:
 686       if (int_args < Argument::n_int_register_parameters_j+1) {
 687         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 688         int_args++;
 689       } else {
 690         return -1;
 691       }
 692       break;
 693     case T_FLOAT:
 694       if (fp_args < Argument::n_float_register_parameters_j) {
 695         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 696         fp_args++;
 697       } else {
 698         return -1;
 699       }
 700       break;
 701     case T_DOUBLE:
 702       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 703       if (fp_args < Argument::n_float_register_parameters_j) {
 704         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 705         fp_args++;
 706       } else {
 707         return -1;
 708       }
 709       break;
 710     default:
 711       ShouldNotReachHere();
 712       break;
 713     }
 714   }
 715 
 716   return int_args + fp_args;
 717 }
 718 
 719 // Patch the callers callsite with entry to compiled code if it exists.
// Patch the callers callsite with entry to compiled code if it exists.
// Emits code that checks whether the callee (method in rbx) now has compiled
// code (Method::code != nullptr); if so, calls into the VM
// (SharedRuntime::fixup_callers_callsite) to repoint the caller's call site
// at it. All CPU state is preserved across the runtime call. Expects the
// caller's return address at the top of the stack.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // Nothing to do if the method has no compiled code yet
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  // c_rarg0 = target method, c_rarg1 = caller's return address (the callsite)
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
 773         // inline type and T_VOID to mark the end of the value
 774         // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
 780         total_args_passed++;
 781         int vt = 1;
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
 803 
 804 
// Store one incoming (compiled-convention) argument, located by reg_pair,
// to the interpreter-convention destination 'to'.
// - bt/prev_bt: basic type of this signature entry and of the previous one;
//   a T_VOID entry is the unused second slot of a preceding T_LONG/T_DOUBLE
//   and produces no store.
// - size_in_bytes: 4 or 8; 8 (wordSize) means both halves of reg_pair are valid.
// - extraspace: byte offset added when the source is a caller stack slot, to
//   reach the caller's outgoing args beyond the area this adapter allocated.
// - is_oop: store through the GC barrier (store_heap_oop) instead of a
//   plain sized store.
static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However to make thing extra confusing. Because we can fit a long/double in
  // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  if (!r_1->is_valid()) {
    // Argument occupies no location at all: nothing to store.
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    // Integer/pointer argument: either already in a register or loaded
    // (zero-extended) from the caller's outgoing stack area into rax.
    Register val = rax;
    if (r_1->is_stack()) {
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      // r13 and rbx are used as temps by the barrier; preserve them.
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    // Floating point argument: store straight from the XMM register.
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}
 869 
// Generate a compiled-to-interpreted (c2i) adapter. On entry rbx holds the
// callee Method* and the arguments are packed in the compiled calling
// convention (described by sig_extended/regs, with inline type arguments
// scalarized into their fields when InlineTypePassFieldsAsArgs is set).
// The adapter unpacks them into the interpreter's all-on-stack layout and
// jumps to the callee's interpreter entry. Scalarized inline type args are
// re-packed into heap buffers obtained from a runtime call;
// oop_maps/frame_complete/frame_size_in_words describe that call's frame.
// c2i_no_clinit_check_entry receives the entry that bypasses the optional
// class-initialization barrier; alloc_inline_receiver requests a buffer for
// the receiver as well.
static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    // Entry point that skips the class-initialization check above.
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least an inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      // allocate_inline_types(thread, callee Method*, alloc_inline_receiver)
      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      // Forward any pending exception raised by the allocation call.
      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
      __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      // Plain (non-scalarized) argument: copy straight to its interpreter slot.
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      ignored++;
      // get the buffer from the just allocated pool of buffers
      int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
      __ load_heap_oop(r14, Address(rscratch2, index));
      next_vt_arg++; next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          vt--;
          ignored++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check: the low bit
            // of the incoming value is set iff the argument is non-null.
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            // Pass null to the interpreter and skip the remaining fields.
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ movptr(Address(rsp, st_off), r14);
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  // Jump to the interpreter entry; rbx holds the Method* and r13 the sender SP.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
1087 
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // always sees the same alignment as far as the placement of the call
  // instruction is concerned
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  int total_args_passed = sig->length();

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    // The interpreter stores each argument in a stackElementSize slot below
    // the sender SP; a long/double owns two slots with the value kept in the
    // lower-addressed (next_off) slot.
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM (floating point) argument.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1265 
1266 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1267   Register data = rax;
1268   __ ic_check(1 /* end_alignment */);
1269   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1270 
1271   // Method might have been compiled since the call site was patched to
1272   // interpreted if that is the case treat it as a miss so we can get
1273   // the call site corrected.
1274   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1275   __ jcc(Assembler::equal, skip_fixup);
1276   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1277 }
1278 
1279 // ---------------------------------------------------------------
1280 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1281                                             int comp_args_on_stack,
1282                                             const GrowableArray<SigEntry>* sig,
1283                                             const VMRegPair* regs,
1284                                             const GrowableArray<SigEntry>* sig_cc,
1285                                             const VMRegPair* regs_cc,
1286                                             const GrowableArray<SigEntry>* sig_cc_ro,
1287                                             const VMRegPair* regs_cc_ro,
1288                                             address entry_address[AdapterBlob::ENTRY_COUNT],
1289                                             AdapterBlob*& new_adapter,
1290                                             bool allocate_code_blob) {
1291   entry_address[AdapterBlob::I2C] = __ pc();
1292   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1293 
1294   // -------------------------------------------------------------------------
1295   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1296   // to the interpreter.  The args start out packed in the compiled layout.  They
1297   // need to be unpacked into the interpreter layout.  This will almost always
1298   // require some stack space.  We grow the current (compiled) stack, then repack
1299   // the args.  We  finally end in a jump to the generic interpreter entry point.
1300   // On exit from the interpreter, the interpreter will restore our SP (lest the
1301   // compiled code, which relies solely on SP and not RBP, get sick).
1302 
1303   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1304   entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1305   Label skip_fixup;
1306 
1307   gen_inline_cache_check(masm, skip_fixup);
1308 
1309   OopMapSet* oop_maps = new OopMapSet();
1310   int frame_complete = CodeOffsets::frame_never_safe;
1311   int frame_size_in_words = 0;
1312 
1313   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1314   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1315   entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
1316   if (regs_cc != regs_cc_ro) {
1317     // No class init barrier needed because method is guaranteed to be non-static
1318     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1319                     skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1320     skip_fixup.reset();
1321   }
1322 
1323   // Scalarized c2i adapter
1324   entry_address[AdapterBlob::C2I]        = __ pc();
1325   entry_address[AdapterBlob::C2I_Inline] = __ pc();
1326   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1327                   skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1328 
1329   // Non-scalarized c2i adapter
1330   if (regs != regs_cc) {
1331     entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1332     Label inline_entry_skip_fixup;
1333     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1334 
1335     entry_address[AdapterBlob::C2I_Inline] = __ pc();
1336     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1337                     inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1338   }
1339 
1340   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1341   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1342   if (allocate_code_blob) {
1343     bool caller_must_gc_arguments = (regs != regs_cc);
1344     int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
1345     assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
1346     AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
1347     new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1348   }
1349 }
1350 
// Map a native (C) call signature onto argument locations per the platform
// C ABI (Win64 or System V AMD64). Fills regs[i] with the register or stack
// location for each sig_bt[i]. On Win64 every argument consumes one
// position in BOTH the integer and FP register sequences and always has a
// shadow (home) slot reserved on the stack.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         int total_args_passed) {

// We return the amount of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          // Win64: an int arg also burns the FP register at this position.
          fp_args++;
          // Allocate slots for callee to stuff register args the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
      case T_METADATA:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          // Win64: burn the matching FP register position and reserve a
          // shadow slot for this register argument.
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          // Win64: an FP arg also burns the integer register at this position.
          int_args++;
          // Allocate slots for callee to stuff register args the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          // Win64: an FP arg also burns the integer register at this position.
          int_args++;
          // Allocate slots for callee to stuff register args the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // windows abi requires that we always allocate enough stack space
  // for 4 64bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
1464 
1465 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1466                                              uint num_bits,
1467                                              uint total_args_passed) {
1468   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1469          "only certain vector sizes are supported for now");
1470 
1471   static const XMMRegister VEC_ArgReg[32] = {
1472      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1473      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1474     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1475     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1476   };
1477 
1478   uint stk_args = 0;
1479   uint fp_args = 0;
1480 
1481   for (uint i = 0; i < total_args_passed; i++) {
1482     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1483     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1484     regs[i].set_pair(vmreg->next(next_val), vmreg);
1485   }
1486 
1487   return stk_args;
1488 }
1489 
1490 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1491   // We always ignore the frame_slots arg and just use the space just below frame pointer
1492   // which by this time is free to use
1493   switch (ret_type) {
1494   case T_FLOAT:
1495     __ movflt(Address(rbp, -wordSize), xmm0);
1496     break;
1497   case T_DOUBLE:
1498     __ movdbl(Address(rbp, -wordSize), xmm0);
1499     break;
1500   case T_VOID:  break;
1501   default: {
1502     __ movptr(Address(rbp, -wordSize), rax);
1503     }
1504   }
1505 }
1506 
1507 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1508   // We always ignore the frame_slots arg and just use the space just below frame pointer
1509   // which by this time is free to use
1510   switch (ret_type) {
1511   case T_FLOAT:
1512     __ movflt(xmm0, Address(rbp, -wordSize));
1513     break;
1514   case T_DOUBLE:
1515     __ movdbl(xmm0, Address(rbp, -wordSize));
1516     break;
1517   case T_VOID:  break;
1518   default: {
1519     __ movptr(rax, Address(rbp, -wordSize));
1520     }
1521   }
1522 }
1523 
1524 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1525     for ( int i = first_arg ; i < arg_count ; i++ ) {
1526       if (args[i].first()->is_Register()) {
1527         __ push(args[i].first()->as_Register());
1528       } else if (args[i].first()->is_XMMRegister()) {
1529         __ subptr(rsp, 2*wordSize);
1530         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1531       }
1532     }
1533 }
1534 
1535 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1536     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1537       if (args[i].first()->is_Register()) {
1538         __ pop(args[i].first()->as_Register());
1539       } else if (args[i].first()->is_XMMRegister()) {
1540         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1541         __ addptr(rsp, 2*wordSize);
1542       }
1543     }
1544 }
1545 
1546 static void verify_oop_args(MacroAssembler* masm,
1547                             const methodHandle& method,
1548                             const BasicType* sig_bt,
1549                             const VMRegPair* regs) {
1550   Register temp_reg = rbx;  // not part of any compiled calling seq
1551   if (VerifyOops) {
1552     for (int i = 0; i < method->size_of_parameters(); i++) {
1553       if (is_reference_type(sig_bt[i])) {
1554         VMReg r = regs[i].first();
1555         assert(r->is_valid(), "bad oop arg");
1556         if (r->is_stack()) {
1557           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1558           __ verify_oop(temp_reg);
1559         } else {
1560           __ verify_oop(r->as_Register());
1561         }
1562       }
1563     }
1564   }
1565 }
1566 
// Debug-only check that an incoming enterSpecial argument actually landed in
// the register the generated code below expects; guards against future changes
// to the calling convention.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1575 
1576 
1577 //---------------------------- continuation_enter_setup ---------------------------
1578 //
1579 // Arguments:
1580 //   None.
1581 //
1582 // Results:
1583 //   rsp: pointer to blank ContinuationEntry
1584 //
1585 // Kills:
1586 //   rax
1587 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  // The entry size and field offsets must be expressible in whole stack slots.
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Allocate stack room for a blank ContinuationEntry and account for it in
  // the caller's frame size.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // The OopMap covers the entry plus one extra word (the saved rbp).
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Push this entry onto the thread's continuation-entry chain:
  // new_entry->parent = old head; thread->cont_entry = new entry (== rsp).
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1605 
1606 //---------------------------- fill_continuation_entry ---------------------------
1607 //
1608 // Arguments:
1609 //   rsp: pointer to blank Continuation entry
1610 //   reg_cont_obj: pointer to the continuation
1611 //   reg_flags: flags
1612 //
1613 // Results:
1614 //   rsp: pointer to filled out ContinuationEntry
1615 //
1616 // Kills:
1617 //   rax
1618 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug-only cookie so frame walkers can sanity-check the entry.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Store the continuation oop and flags; chunk, argsize and pin_count start at zero.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Preserve the caller's cont_fastpath value in the entry, then clear it for
  // the new continuation scope.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1635 
1636 //---------------------------- continuation_enter_cleanup ---------------------------
1637 //
1638 // Arguments:
1639 //   rsp: pointer to the ContinuationEntry
1640 //
1641 // Results:
1642 //   rsp: pointer to the spilled rbp in the entry frame
1643 //
1644 // Kills:
1645 //   rbx
1646 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // rsp must point exactly at the current ContinuationEntry here.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the parent's cont_fastpath, unlink this entry from the thread's
  // chain, and finally deallocate the entry from the stack.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1661 
// Generate the body of the Continuation.enterSpecial intrinsic:
//   enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
// Emits two entry points: an interpreted (i2i) entry used only in
// interp_only_mode, and the regular compiled entry. Both build a
// ContinuationEntry frame and then either jump to the thaw stub (isContinue)
// or resolve and call Continuation.enter. Shared normal-exit and
// exception-forwarding paths follow. The offsets needed for nmethod
// bookkeeping are returned through the reference out-parameters.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  // Look up the caller's exception handler; the result is moved into rbx below.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1834 
// Generate the body of the Continuation.doYield intrinsic: freeze the current
// continuation. If the freeze succeeds, unwind through the ContinuationEntry
// back to the enterSpecial caller; if the continuation is pinned, return to
// our own caller, forwarding any pending exception raised by the freeze.
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in 32-bit stack slots.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize /  VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Call Continuation::freeze_entry(thread, sp).
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // A non-zero result from freeze means the continuation is pinned.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Frozen: switch rsp to the ContinuationEntry, unlink and deallocate it,
  // and return to the enterSpecial caller.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1896 
// Public entry point; forwards to the file-local continuation_enter_cleanup()
// defined above.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1900 
// Emit the compiled entry for a method-handle signature-polymorphic intrinsic
// (linkTo*, invokeBasic, linkToNative): locate the receiver and/or the
// trailing MemberName/NativeEntryPoint argument required by the intrinsic id,
// make sure they are in registers, and dispatch to the real target via
// MethodHandles::generate_method_handle_dispatch().
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool     has_receiver   = false;
  Register receiver_reg   = noreg;
  int      member_arg_pos = -1;
  Register member_reg     = noreg;
  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Stack-resident argument: +wordSize skips the return address.
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note:  This assumes that compiled calling conventions always
      // pass the receiver oop in a register.  If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1962 
1963 // ---------------------------------------------------------------------------
1964 // Generate a native wrapper for a given method.  The method takes arguments
1965 // in the Java compiled code convention, marshals them to the native
1966 // convention (handlizes oops, etc), transitions to native, makes the call,
1967 // returns to java state (possibly blocking), unhandlizes any result and
1968 // returns.
1969 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1978 //
1979 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1980                                                 const methodHandle& method,
1981                                                 int compile_id,
1982                                                 BasicType* in_sig_bt,
1983                                                 VMRegPair* in_regs,
1984                                                 BasicType ret_type) {
1985   if (method->is_continuation_native_intrinsic()) {
1986     int exception_offset = -1;
1987     OopMapSet* oop_maps = new OopMapSet();
1988     int frame_complete = -1;
1989     int stack_slots = -1;
1990     int interpreted_entry_offset = -1;
1991     int vep_offset = -1;
1992     if (method->is_continuation_enter_intrinsic()) {
1993       gen_continuation_enter(masm,
1994                              in_regs,
1995                              exception_offset,
1996                              oop_maps,
1997                              frame_complete,
1998                              stack_slots,
1999                              interpreted_entry_offset,
2000                              vep_offset);
2001     } else if (method->is_continuation_yield_intrinsic()) {
2002       gen_continuation_yield(masm,
2003                              in_regs,
2004                              oop_maps,
2005                              frame_complete,
2006                              stack_slots,
2007                              vep_offset);
2008     } else {
2009       guarantee(false, "Unknown Continuation native intrinsic");
2010     }
2011 
2012 #ifdef ASSERT
2013     if (method->is_continuation_enter_intrinsic()) {
2014       assert(interpreted_entry_offset != -1, "Must be set");
2015       assert(exception_offset != -1,         "Must be set");
2016     } else {
2017       assert(interpreted_entry_offset == -1, "Must be unset");
2018       assert(exception_offset == -1,         "Must be unset");
2019     }
2020     assert(frame_complete != -1,    "Must be set");
2021     assert(stack_slots != -1,       "Must be set");
2022     assert(vep_offset != -1,        "Must be set");
2023 #endif
2024 
2025     __ flush();
2026     nmethod* nm = nmethod::new_native_nmethod(method,
2027                                               compile_id,
2028                                               masm->code(),
2029                                               vep_offset,
2030                                               frame_complete,
2031                                               stack_slots,
2032                                               in_ByteSize(-1),
2033                                               in_ByteSize(-1),
2034                                               oop_maps,
2035                                               exception_offset);
2036     if (nm == nullptr) return nm;
2037     if (method->is_continuation_enter_intrinsic()) {
2038       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2039     } else if (method->is_continuation_yield_intrinsic()) {
2040       _cont_doYield_stub = nm;
2041     }
2042     return nm;
2043   }
2044 
2045   if (method->is_method_handle_intrinsic()) {
2046     vmIntrinsics::ID iid = method->intrinsic_id();
2047     intptr_t start = (intptr_t)__ pc();
2048     int vep_offset = ((intptr_t)__ pc()) - start;
2049     gen_special_dispatch(masm,
2050                          method,
2051                          in_sig_bt,
2052                          in_regs);
2053     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2054     __ flush();
2055     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2056     return nmethod::new_native_nmethod(method,
2057                                        compile_id,
2058                                        masm->code(),
2059                                        vep_offset,
2060                                        frame_complete,
2061                                        stack_slots / VMRegImpl::slots_per_word,
2062                                        in_ByteSize(-1),
2063                                        in_ByteSize(-1),
2064                                        nullptr);
2065   }
2066   address native_func = method->native_function();
2067   assert(native_func != nullptr, "must have function");
2068 
2069   // An OopMap for lock (and class if static)
2070   OopMapSet *oop_maps = new OopMapSet();
2071   intptr_t start = (intptr_t)__ pc();
2072 
2073   // We have received a description of where all the java arg are located
2074   // on entry to the wrapper. We need to convert these args to where
2075   // the jni function will expect them. To figure out where they go
2076   // we convert the java signature to a C signature by inserting
2077   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2078 
2079   const int total_in_args = method->size_of_parameters();
2080   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2081 
2082   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2083   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2084 
2085   int argc = 0;
2086   out_sig_bt[argc++] = T_ADDRESS;
2087   if (method->is_static()) {
2088     out_sig_bt[argc++] = T_OBJECT;
2089   }
2090 
2091   for (int i = 0; i < total_in_args ; i++ ) {
2092     out_sig_bt[argc++] = in_sig_bt[i];
2093   }
2094 
2095   // Now figure out where the args must be stored and how much stack space
2096   // they require.
2097   int out_arg_slots;
2098   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2099 
2100   // Compute framesize for the wrapper.  We need to handlize all oops in
2101   // incoming registers
2102 
2103   // Calculate the total number of stack slots we will need.
2104 
2105   // First count the abi requirement plus all of the outgoing args
2106   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2107 
2108   // Now the space for the inbound oop handle area
2109   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2110 
2111   int oop_handle_offset = stack_slots;
2112   stack_slots += total_save_slots;
2113 
2114   // Now any space we need for handlizing a klass if static method
2115 
2116   int klass_slot_offset = 0;
2117   int klass_offset = -1;
2118   int lock_slot_offset = 0;
2119   bool is_static = false;
2120 
2121   if (method->is_static()) {
2122     klass_slot_offset = stack_slots;
2123     stack_slots += VMRegImpl::slots_per_word;
2124     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2125     is_static = true;
2126   }
2127 
2128   // Plus a lock if needed
2129 
2130   if (method->is_synchronized()) {
2131     lock_slot_offset = stack_slots;
2132     stack_slots += VMRegImpl::slots_per_word;
2133   }
2134 
2135   // Now a place (+2) to save return values or temp during shuffling
2136   // + 4 for return address (which we own) and saved rbp
2137   stack_slots += 6;
2138 
2139   // Ok The space we have allocated will look like:
2140   //
2141   //
2142   // FP-> |                     |
2143   //      |---------------------|
2144   //      | 2 slots for moves   |
2145   //      |---------------------|
2146   //      | lock box (if sync)  |
2147   //      |---------------------| <- lock_slot_offset
2148   //      | klass (if static)   |
2149   //      |---------------------| <- klass_slot_offset
2150   //      | oopHandle area      |
2151   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2152   //      | outbound memory     |
2153   //      | based arguments     |
2154   //      |                     |
2155   //      |---------------------|
2156   //      |                     |
2157   // SP-> | out_preserved_slots |
2158   //
2159   //
2160 
2161 
2162   // Now compute actual number of stack words we need rounding to make
2163   // stack properly aligned.
2164   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2165 
2166   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2167 
2168   // First thing make an ic check to see if we should even be here
2169 
2170   // We are free to use all registers as temps without saving them and
2171   // restoring them except rbp. rbp is the only callee save register
2172   // as far as the interpreter and the compiler(s) are concerned.
2173 
2174   const Register receiver = j_rarg0;
2175 
2176   Label exception_pending;
2177 
2178   assert_different_registers(receiver, rscratch1, rscratch2);
2179   __ verify_oop(receiver);
2180   __ ic_check(8 /* end_alignment */);
2181 
2182   int vep_offset = ((intptr_t)__ pc()) - start;
2183 
2184   if (method->needs_clinit_barrier()) {
2185     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2186     Label L_skip_barrier;
2187     Register klass = r10;
2188     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2189     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2190 
2191     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2192 
2193     __ bind(L_skip_barrier);
2194   }
2195 
2196 #ifdef COMPILER1
2197   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2198   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2199     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2200   }
2201 #endif // COMPILER1
2202 
2203   // The instruction at the verified entry point must be 5 bytes or longer
2204   // because it can be patched on the fly by make_non_entrant. The stack bang
2205   // instruction fits that requirement.
2206 
2207   // Generate stack overflow check
2208   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2209 
2210   // Generate a new frame for the wrapper.
2211   __ enter();
2212   // -2 because return address is already present and so is saved rbp
2213   __ subptr(rsp, stack_size - 2*wordSize);
2214 
2215   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2216   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2217   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2218 
2219   // Frame is now completed as far as size and linkage.
2220   int frame_complete = ((intptr_t)__ pc()) - start;
2221 
2222 #ifdef ASSERT
2223   __ check_stack_alignment(rsp, "improperly aligned stack");
2224 #endif /* ASSERT */
2225 
2226 
2227   // We use r14 as the oop handle for the receiver/klass
2228   // It is callee save so it survives the call to native
2229 
2230   const Register oop_handle_reg = r14;
2231 
2232   //
2233   // We immediately shuffle the arguments so that any vm call we have to
2234   // make from here on out (sync slow path, jvmti, etc.) we will have
2235   // captured the oops from our caller and have a valid oopMap for
2236   // them.
2237 
2238   // -----------------
2239   // The Grand Shuffle
2240 
2241   // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
2243   // convention always has at least one more (and two for static) arguments than Java.
2244   // Therefore if we move the args from java -> c backwards then we will never have
2245   // a register->register conflict and we don't have to build a dependency graph
2246   // and figure out how to break any cycles.
2247   //
2248 
2249   // Record esp-based slot for receiver on stack for non-static methods
2250   int receiver_offset = -1;
2251 
2252   // This is a trick. We double the stack slots so we can claim
2253   // the oops in the caller's frame. Since we are sure to have
2254   // more args than the caller doubling is enough to make
2255   // sure we can capture all the incoming oop args from the
2256   // caller.
2257   //
2258   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2259 
2260   // Mark location of rbp (someday)
2261   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2262 
2263   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2264   // All inbound args are referenced based on rbp and all outbound args via rsp.
2265 
2266 
2267 #ifdef ASSERT
2268   bool reg_destroyed[Register::number_of_registers];
2269   bool freg_destroyed[XMMRegister::number_of_registers];
2270   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2271     reg_destroyed[r] = false;
2272   }
2273   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2274     freg_destroyed[f] = false;
2275   }
2276 
2277 #endif /* ASSERT */
2278 
2279   // For JNI natives the incoming and outgoing registers are offset upwards.
2280   GrowableArray<int> arg_order(2 * total_in_args);
2281 
2282   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2283     arg_order.push(i);
2284     arg_order.push(c_arg);
2285   }
2286 
2287   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2288     int i = arg_order.at(ai);
2289     int c_arg = arg_order.at(ai + 1);
2290     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2291 #ifdef ASSERT
2292     if (in_regs[i].first()->is_Register()) {
2293       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2294     } else if (in_regs[i].first()->is_XMMRegister()) {
2295       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2296     }
2297     if (out_regs[c_arg].first()->is_Register()) {
2298       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2299     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2300       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2301     }
2302 #endif /* ASSERT */
2303     switch (in_sig_bt[i]) {
2304       case T_ARRAY:
2305       case T_OBJECT:
2306         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2307                     ((i == 0) && (!is_static)),
2308                     &receiver_offset);
2309         break;
2310       case T_VOID:
2311         break;
2312 
2313       case T_FLOAT:
2314         __ float_move(in_regs[i], out_regs[c_arg]);
2315           break;
2316 
2317       case T_DOUBLE:
2318         assert( i + 1 < total_in_args &&
2319                 in_sig_bt[i + 1] == T_VOID &&
2320                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2321         __ double_move(in_regs[i], out_regs[c_arg]);
2322         break;
2323 
2324       case T_LONG :
2325         __ long_move(in_regs[i], out_regs[c_arg]);
2326         break;
2327 
2328       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2329 
2330       default:
2331         __ move32_64(in_regs[i], out_regs[c_arg]);
2332     }
2333   }
2334 
2335   int c_arg;
2336 
2337   // Pre-load a static method's oop into r14.  Used both by locking code and
2338   // the normal JNI call code.
2339   // point c_arg at the first arg that is already loaded in case we
2340   // need to spill before we call out
2341   c_arg = total_c_args - total_in_args;
2342 
2343   if (method->is_static()) {
2344 
2345     //  load oop into a register
2346     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2347 
    // Now handlize the static class mirror; it is known not-null.
2349     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2350     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2351 
2352     // Now get the handle
2353     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2354     // store the klass handle as second argument
2355     __ movptr(c_rarg1, oop_handle_reg);
2356     // and protect the arg if we must spill
2357     c_arg--;
2358   }
2359 
2360   // Change state to native (we save the return address in the thread, since it might not
2361   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2362   // points into the right code segment. It does not have to be the correct return pc.
2363   // We use the same pc/oopMap repeatedly when we call out
2364 
2365   Label native_return;
2366   if (method->is_object_wait0()) {
2367     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2368     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2369   } else {
2370     intptr_t the_pc = (intptr_t) __ pc();
2371     oop_maps->add_gc_map(the_pc - start, map);
2372 
2373     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2374   }
2375 
  // We have all of the arguments set up at this point. We must not touch any
  // argument registers from here on (if we were to save/restore them, the oop
  // map would not cover them).
2378 
2379   if (DTraceMethodProbes) {
2380     // protect the args we've loaded
2381     save_args(masm, total_c_args, c_arg, out_regs);
2382     __ mov_metadata(c_rarg1, method());
2383     __ call_VM_leaf(
2384       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2385       r15_thread, c_rarg1);
2386     restore_args(masm, total_c_args, c_arg, out_regs);
2387   }
2388 
2389   // RedefineClasses() tracing support for obsolete method entry
2390   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2391     // protect the args we've loaded
2392     save_args(masm, total_c_args, c_arg, out_regs);
2393     __ mov_metadata(c_rarg1, method());
2394     __ call_VM_leaf(
2395       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2396       r15_thread, c_rarg1);
2397     restore_args(masm, total_c_args, c_arg, out_regs);
2398   }
2399 
2400   // Lock a synchronized method
2401 
2402   // Register definitions used by locking and unlocking
2403 
2404   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2405   const Register obj_reg  = rbx;  // Will contain the oop
2406   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2407 
2408   Label slow_path_lock;
2409   Label lock_done;
2410 
2411   if (method->is_synchronized()) {
2412     // Get the handle (the 2nd argument)
2413     __ mov(oop_handle_reg, c_rarg1);
2414 
2415     // Get address of the box
2416 
2417     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2418 
2419     // Load the oop from the handle
2420     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2421 
2422     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2423 
2424     // Slow path will re-enter here
2425     __ bind(lock_done);
2426   }
2427 
2428   // Finally just about ready to make the JNI call
2429 
2430   // get JNIEnv* which is first argument to native
2431   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2432 
2433   // Now set thread in native
2434   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2435 
2436   __ call(RuntimeAddress(native_func));
2437 
2438   // Verify or restore cpu control state after JNI call
2439   __ restore_cpu_control_state_after_jni(rscratch1);
2440 
2441   // Unpack native results.
2442   switch (ret_type) {
2443   case T_BOOLEAN: __ c2bool(rax);            break;
2444   case T_CHAR   : __ movzwl(rax, rax);      break;
2445   case T_BYTE   : __ sign_extend_byte (rax); break;
2446   case T_SHORT  : __ sign_extend_short(rax); break;
2447   case T_INT    : /* nothing to do */        break;
2448   case T_DOUBLE :
2449   case T_FLOAT  :
2450     // Result is in xmm0 we'll save as needed
2451     break;
2452   case T_ARRAY:                 // Really a handle
2453   case T_OBJECT:                // Really a handle
2454       break; // can't de-handlize until after safepoint check
2455   case T_VOID: break;
2456   case T_LONG: break;
2457   default       : ShouldNotReachHere();
2458   }
2459 
2460   // Switch thread to "native transition" state before reading the synchronization state.
2461   // This additional state is necessary because reading and testing the synchronization
2462   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2463   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2464   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2465   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2467   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2468 
2469   // Force this write out before the read below
2470   if (!UseSystemMemoryBarrier) {
2471     __ membar(Assembler::Membar_mask_bits(
2472               Assembler::LoadLoad | Assembler::LoadStore |
2473               Assembler::StoreLoad | Assembler::StoreStore));
2474   }
2475 
2476   // check for safepoint operation in progress and/or pending suspend requests
2477   {
2478     Label Continue;
2479     Label slow_path;
2480 
2481     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2482 
2483     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2484     __ jcc(Assembler::equal, Continue);
2485     __ bind(slow_path);
2486 
2487     // Don't use call_VM as it will see a possible pending exception and forward it
2488     // and never return here preventing us from clearing _last_native_pc down below.
2489     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2490     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2491     // by hand.
2492     //
2493     __ vzeroupper();
2494     save_native_result(masm, ret_type, stack_slots);
2495     __ mov(c_rarg0, r15_thread);
2496     __ mov(r12, rsp); // remember sp
2497     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2498     __ andptr(rsp, -16); // align stack as required by ABI
2499     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2500     __ mov(rsp, r12); // restore sp
2501     __ reinit_heapbase();
2502     // Restore any method result value
2503     restore_native_result(masm, ret_type, stack_slots);
2504     __ bind(Continue);
2505   }
2506 
2507   // change thread state
2508   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2509 
2510   if (method->is_object_wait0()) {
2511     // Check preemption for Object.wait()
2512     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2513     __ cmpptr(rscratch1, NULL_WORD);
2514     __ jccb(Assembler::equal, native_return);
2515     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2516     __ jmp(rscratch1);
2517     __ bind(native_return);
2518 
2519     intptr_t the_pc = (intptr_t) __ pc();
2520     oop_maps->add_gc_map(the_pc - start, map);
2521   }
2522 
2523 
2524   Label reguard;
2525   Label reguard_done;
2526   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2527   __ jcc(Assembler::equal, reguard);
2528   __ bind(reguard_done);
2529 
2530   // native result if any is live
2531 
2532   // Unlock
2533   Label slow_path_unlock;
2534   Label unlock_done;
2535   if (method->is_synchronized()) {
2536 
2537     Label fast_done;
2538 
2539     // Get locked oop from the handle we passed to jni
2540     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2541 
2542     // Must save rax if it is live now because cmpxchg must use it
2543     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2544       save_native_result(masm, ret_type, stack_slots);
2545     }
2546 
2547     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2548 
2549     // slow path re-enters here
2550     __ bind(unlock_done);
2551     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2552       restore_native_result(masm, ret_type, stack_slots);
2553     }
2554 
2555     __ bind(fast_done);
2556   }
2557   if (DTraceMethodProbes) {
2558     save_native_result(masm, ret_type, stack_slots);
2559     __ mov_metadata(c_rarg1, method());
2560     __ call_VM_leaf(
2561          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2562          r15_thread, c_rarg1);
2563     restore_native_result(masm, ret_type, stack_slots);
2564   }
2565 
2566   __ reset_last_Java_frame(false);
2567 
2568   // Unbox oop result, e.g. JNIHandles::resolve value.
2569   if (is_reference_type(ret_type)) {
2570     __ resolve_jobject(rax /* value */,
2571                        rcx /* tmp */);
2572   }
2573 
2574   if (CheckJNICalls) {
2575     // clear_pending_jni_exception_check
2576     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2577   }
2578 
2579   // reset handle block
2580   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2581   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2582 
2583   // pop our frame
2584 
2585   __ leave();
2586 
2587 #if INCLUDE_JFR
2588   // We need to do a poll test after unwind in case the sampler
2589   // managed to sample the native frame after returning to Java.
2590   Label L_return;
2591   address poll_test_pc = __ pc();
2592   __ relocate(relocInfo::poll_return_type);
2593   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2594   __ jccb(Assembler::zero, L_return);
2595   __ lea(rscratch1, InternalAddress(poll_test_pc));
2596   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2597   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2598     "polling page return stub not created yet");
2599   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2600   __ jump(RuntimeAddress(stub));
2601   __ bind(L_return);
2602 #endif // INCLUDE_JFR
2603 
2604   // Any exception pending?
2605   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2606   __ jcc(Assembler::notEqual, exception_pending);
2607 
2608   // Return
2609 
2610   __ ret(0);
2611 
2612   // Unexpected paths are out of line and go here
2613 
2614   // forward the exception
2615   __ bind(exception_pending);
2616 
2617   // and forward the exception
2618   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2619 
2620   // Slow path locking & unlocking
2621   if (method->is_synchronized()) {
2622 
2623     // BEGIN Slow path lock
2624     __ bind(slow_path_lock);
2625 
2626     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2627     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2628 
2629     // protect the args we've loaded
2630     save_args(masm, total_c_args, c_arg, out_regs);
2631 
2632     __ mov(c_rarg0, obj_reg);
2633     __ mov(c_rarg1, lock_reg);
2634     __ mov(c_rarg2, r15_thread);
2635 
2636     // Not a leaf but we have last_Java_frame setup as we want.
2637     // We don't want to unmount in case of contention since that would complicate preserving
2638     // the arguments that had already been marshalled into the native convention. So we force
2639     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2640     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2641     __ push_cont_fastpath();
2642     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2643     __ pop_cont_fastpath();
2644     restore_args(masm, total_c_args, c_arg, out_regs);
2645 
2646 #ifdef ASSERT
2647     { Label L;
2648     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2649     __ jcc(Assembler::equal, L);
2650     __ stop("no pending exception allowed on exit from monitorenter");
2651     __ bind(L);
2652     }
2653 #endif
2654     __ jmp(lock_done);
2655 
2656     // END Slow path lock
2657 
2658     // BEGIN Slow path unlock
2659     __ bind(slow_path_unlock);
2660 
2661     // If we haven't already saved the native result we must save it now as xmm registers
2662     // are still exposed.
2663     __ vzeroupper();
2664     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2665       save_native_result(masm, ret_type, stack_slots);
2666     }
2667 
2668     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2669 
2670     __ mov(c_rarg0, obj_reg);
2671     __ mov(c_rarg2, r15_thread);
2672     __ mov(r12, rsp); // remember sp
2673     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2674     __ andptr(rsp, -16); // align stack as required by ABI
2675 
2676     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2677     // NOTE that obj_reg == rbx currently
2678     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2679     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2680 
2681     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2682     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2683     __ mov(rsp, r12); // restore sp
2684     __ reinit_heapbase();
2685 #ifdef ASSERT
2686     {
2687       Label L;
2688       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2689       __ jcc(Assembler::equal, L);
2690       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2691       __ bind(L);
2692     }
2693 #endif /* ASSERT */
2694 
2695     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2696 
2697     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2698       restore_native_result(masm, ret_type, stack_slots);
2699     }
2700     __ jmp(unlock_done);
2701 
2702     // END Slow path unlock
2703 
2704   } // synchronized
2705 
2706   // SLOW PATH Reguard the stack if needed
2707 
2708   __ bind(reguard);
2709   __ vzeroupper();
2710   save_native_result(masm, ret_type, stack_slots);
2711   __ mov(r12, rsp); // remember sp
2712   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2713   __ andptr(rsp, -16); // align stack as required by ABI
2714   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2715   __ mov(rsp, r12); // restore sp
2716   __ reinit_heapbase();
2717   restore_native_result(masm, ret_type, stack_slots);
2718   // and continue
2719   __ jmp(reguard_done);
2720 
2721 
2722 
2723   __ flush();
2724 
2725   nmethod *nm = nmethod::new_native_nmethod(method,
2726                                             compile_id,
2727                                             masm->code(),
2728                                             vep_offset,
2729                                             frame_complete,
2730                                             stack_slots / VMRegImpl::slots_per_word,
2731                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2732                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2733                                             oop_maps);
2734 
2735   return nm;
2736 }
2737 
// this function returns the adjusted size (in number of words) of a c2i adapter
// activation for use during deoptimization
2740 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2741   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2742 }
2743 
2744 
2745 uint SharedRuntime::out_preserve_stack_slots() {
2746   return 0;
2747 }
2748 
2749 
2750 // Number of stack slots between incoming argument block and the start of
2751 // a new frame.  The PROLOG must add this many slots to the stack.  The
2752 // EPILOG must remove this many slots.  amd64 needs two slots for
2753 // return address.
2754 uint SharedRuntime::in_preserve_stack_slots() {
2755   return 4 + 2 * VerifyStackAtCalls;
2756 }
2757 
2758 VMReg SharedRuntime::thread_register() {
2759   return r15_thread->as_VMReg();
2760 }
2761 
2762 //------------------------------generate_deopt_blob----------------------------
2763 void SharedRuntime::generate_deopt_blob() {
2764   // Allocate space for the code
2765   ResourceMark rm;
2766   // Setup code generation tools
2767   int pad = 0;
2768   if (UseAVX > 2) {
2769     pad += 1024;
2770   }
2771   if (UseAPX) {
2772     pad += 1024;
2773   }
2774 #if INCLUDE_JVMCI
2775   if (EnableJVMCI) {
2776     pad += 512; // Increase the buffer size when compiling for JVMCI
2777   }
2778 #endif
2779   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2780   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2781   if (blob != nullptr) {
2782     _deopt_blob = blob->as_deoptimization_blob();
2783     return;
2784   }
2785 
2786   CodeBuffer buffer(name, 2560+pad, 1024);
2787   MacroAssembler* masm = new MacroAssembler(&buffer);
2788   int frame_size_in_words;
2789   OopMap* map = nullptr;
2790   OopMapSet *oop_maps = new OopMapSet();
2791 
2792   // -------------
2793   // This code enters when returning to a de-optimized nmethod.  A return
2794   // address has been pushed on the stack, and return values are in
2795   // registers.
2796   // If we are doing a normal deopt then we were called from the patched
2797   // nmethod from the point we returned to the nmethod. So the return
2798   // address on the stack is wrong by NativeCall::instruction_size
2799   // We will adjust the value so it looks like we have the original return
2800   // address on the stack (like when we eagerly deoptimized).
2801   // In the case of an exception pending when deoptimizing, we enter
2802   // with a return address on the stack that points after the call we patched
2803   // into the exception handler. We have the following register state from,
2804   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2805   //    rax: exception oop
2806   //    rbx: exception handler
2807   //    rdx: throwing pc
2808   // So in this case we simply jam rdx into the useless return address and
2809   // the stack looks just like we want.
2810   //
2811   // At this point we need to de-opt.  We save the argument return
2812   // registers.  We call the first C routine, fetch_unroll_info().  This
2813   // routine captures the return values and returns a structure which
2814   // describes the current frame size and the sizes of all replacement frames.
2815   // The current frame is compiled code and may contain many inlined
2816   // functions, each with their own JVM state.  We pop the current frame, then
2817   // push all the new frames.  Then we call the C routine unpack_frames() to
2818   // populate these frames.  Finally unpack_frames() returns us the new target
2819   // address.  Notice that callee-save registers are BLOWN here; they have
2820   // already been captured in the vframeArray at the time the return PC was
2821   // patched.
2822   address start = __ pc();
2823   Label cont;
2824 
2825   // Prolog for non exception case!
2826 
2827   // Save everything in sight.
2828   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2829 
2830   // Normal deoptimization.  Save exec mode for unpack_frames.
2831   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2832   __ jmp(cont);
2833 
2834   int reexecute_offset = __ pc() - start;
2835 #if INCLUDE_JVMCI && !defined(COMPILER1)
2836   if (UseJVMCICompiler) {
2837     // JVMCI does not use this kind of deoptimization
2838     __ should_not_reach_here();
2839   }
2840 #endif
2841 
2842   // Reexecute case
  // return address is the pc that describes what bci to re-execute at
2844 
2845   // No need to update map as each call to save_live_registers will produce identical oopmap
2846   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2847 
2848   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2849   __ jmp(cont);
2850 
2851 #if INCLUDE_JVMCI
2852   Label after_fetch_unroll_info_call;
2853   int implicit_exception_uncommon_trap_offset = 0;
2854   int uncommon_trap_offset = 0;
2855 
2856   if (EnableJVMCI) {
2857     implicit_exception_uncommon_trap_offset = __ pc() - start;
2858 
2859     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2860     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2861 
2862     uncommon_trap_offset = __ pc() - start;
2863 
2864     // Save everything in sight.
2865     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2866     // fetch_unroll_info needs to call last_java_frame()
2867     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2868 
2869     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2870     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2871 
2872     __ movl(r14, Deoptimization::Unpack_reexecute);
2873     __ mov(c_rarg0, r15_thread);
2874     __ movl(c_rarg2, r14); // exec mode
2875     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2876     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2877 
2878     __ reset_last_Java_frame(false);
2879 
2880     __ jmp(after_fetch_unroll_info_call);
2881   } // EnableJVMCI
2882 #endif // INCLUDE_JVMCI
2883 
2884   int exception_offset = __ pc() - start;
2885 
2886   // Prolog for exception case
2887 
2888   // all registers are dead at this entry point, except for rax, and
2889   // rdx which contain the exception oop and exception pc
2890   // respectively.  Set them in TLS and fall thru to the
2891   // unpack_with_exception_in_tls entry point.
2892 
2893   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2894   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2895 
2896   int exception_in_tls_offset = __ pc() - start;
2897 
2898   // new implementation because exception oop is now passed in JavaThread
2899 
2900   // Prolog for exception case
2901   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2903   // tos: stack at point of call to method that threw the exception (i.e. only
2904   // args are on the stack, no return address)
2905 
2906   // make room on stack for the return address
2907   // It will be patched later with the throwing pc. The correct value is not
2908   // available now because loading it from memory would destroy registers.
2909   __ push(0);
2910 
2911   // Save everything in sight.
2912   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2913 
2914   // Now it is safe to overwrite any register
2915 
2916   // Deopt during an exception.  Save exec mode for unpack_frames.
2917   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2918 
2919   // load throwing pc from JavaThread and patch it as the return address
2920   // of the current frame. Then clear the field in JavaThread
2921 
2922   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2923   __ movptr(Address(rbp, wordSize), rdx);
2924   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2925 
2926 #ifdef ASSERT
2927   // verify that there is really an exception oop in JavaThread
2928   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2929   __ verify_oop(rax);
2930 
2931   // verify that there is no pending exception
2932   Label no_pending_exception;
2933   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2934   __ testptr(rax, rax);
2935   __ jcc(Assembler::zero, no_pending_exception);
2936   __ stop("must not have pending exception here");
2937   __ bind(no_pending_exception);
2938 #endif
2939 
2940   __ bind(cont);
2941 
2942   // Call C code.  Need thread and this frame, but NOT official VM entry
2943   // crud.  We cannot block on this call, no GC can happen.
2944   //
2945   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2946 
2947   // fetch_unroll_info needs to call last_java_frame().
2948 
2949   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2950 #ifdef ASSERT
2951   { Label L;
2952     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2953     __ jcc(Assembler::equal, L);
2954     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2955     __ bind(L);
2956   }
2957 #endif // ASSERT
2958   __ mov(c_rarg0, r15_thread);
2959   __ movl(c_rarg1, r14); // exec_mode
2960   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2961 
2962   // Need to have an oopmap that tells fetch_unroll_info where to
2963   // find any register it might need.
2964   oop_maps->add_gc_map(__ pc() - start, map);
2965 
2966   __ reset_last_Java_frame(false);
2967 
2968 #if INCLUDE_JVMCI
2969   if (EnableJVMCI) {
2970     __ bind(after_fetch_unroll_info_call);
2971   }
2972 #endif
2973 
2974   // Load UnrollBlock* into rdi
2975   __ mov(rdi, rax);
2976 
2977   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2978    Label noException;
2979   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2980   __ jcc(Assembler::notEqual, noException);
2981   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2982   // QQQ this is useless it was null above
2983   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2984   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2985   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2986 
2987   __ verify_oop(rax);
2988 
2989   // Overwrite the result registers with the exception results.
2990   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2991   // I think this is useless
2992   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2993 
2994   __ bind(noException);
2995 
2996   // Only register save data is on the stack.
2997   // Now restore the result registers.  Everything else is either dead
2998   // or captured in the vframeArray.
2999   RegisterSaver::restore_result_registers(masm);
3000 
3001   // All of the register save area has been popped of the stack. Only the
3002   // return address remains.
3003 
3004   // Pop all the frames we must move/replace.
3005   //
3006   // Frame picture (youngest to oldest)
3007   // 1: self-frame (no frame link)
3008   // 2: deopting frame  (no frame link)
3009   // 3: caller of deopting frame (could be compiled/interpreted).
3010   //
3011   // Note: by leaving the return address of self-frame on the stack
3012   // and using the size of frame 2 to adjust the stack
3013   // when we are done the return to frame 3 will still be on the stack.
3014 
3015   // Pop deoptimized frame
3016   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3017   __ addptr(rsp, rcx);
3018 
3019   // rsp should be pointing at the return address to the caller (3)
3020 
3021   // Pick up the initial fp we should save
3022   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3023   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3024 
3025 #ifdef ASSERT
3026   // Compilers generate code that bang the stack by as much as the
3027   // interpreter would need. So this stack banging should never
3028   // trigger a fault. Verify that it does not on non product builds.
3029   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3030   __ bang_stack_size(rbx, rcx);
3031 #endif
3032 
3033   // Load address of array of frame pcs into rcx
3034   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3035 
3036   // Trash the old pc
3037   __ addptr(rsp, wordSize);
3038 
3039   // Load address of array of frame sizes into rsi
3040   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3041 
3042   // Load counter into rdx
3043   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3044 
3045   // Now adjust the caller's stack to make up for the extra locals
3046   // but record the original sp so that we can save it in the skeletal interpreter
3047   // frame and the stack walking of interpreter_sender will get the unextended sp
3048   // value and not the "real" sp value.
3049 
3050   const Register sender_sp = r8;
3051 
3052   __ mov(sender_sp, rsp);
3053   __ movl(rbx, Address(rdi,
3054                        Deoptimization::UnrollBlock::
3055                        caller_adjustment_offset()));
3056   __ subptr(rsp, rbx);
3057 
3058   // Push interpreter frames in a loop
3059   Label loop;
3060   __ bind(loop);
3061   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3062   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3063   __ pushptr(Address(rcx, 0));          // Save return address
3064   __ enter();                           // Save old & set new ebp
3065   __ subptr(rsp, rbx);                  // Prolog
3066   // This value is corrected by layout_activation_impl
3067   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3068   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3069   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3070   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3071   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3072   __ decrementl(rdx);                   // Decrement counter
3073   __ jcc(Assembler::notZero, loop);
3074   __ pushptr(Address(rcx, 0));          // Save final return address
3075 
3076   // Re-push self-frame
3077   __ enter();                           // Save old & set new ebp
3078 
3079   // Allocate a full sized register save area.
3080   // Return address and rbp are in place, so we allocate two less words.
3081   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3082 
3083   // Restore frame locals after moving the frame
3084   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3085   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3086 
3087   // Call C code.  Need thread but NOT official VM entry
3088   // crud.  We cannot block on this call, no GC can happen.  Call should
3089   // restore return values to their stack-slots with the new SP.
3090   //
3091   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3092 
3093   // Use rbp because the frames look interpreted now
3094   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3095   // Don't need the precise return PC here, just precise enough to point into this code blob.
3096   address the_pc = __ pc();
3097   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3098 
3099   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3100   __ mov(c_rarg0, r15_thread);
3101   __ movl(c_rarg1, r14); // second arg: exec_mode
3102   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3103   // Revert SP alignment after call since we're going to do some SP relative addressing below
3104   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3105 
3106   // Set an oopmap for the call site
3107   // Use the same PC we used for the last java frame
3108   oop_maps->add_gc_map(the_pc - start,
3109                        new OopMap( frame_size_in_words, 0 ));
3110 
3111   // Clear fp AND pc
3112   __ reset_last_Java_frame(true);
3113 
3114   // Collect return values
3115   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3116   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3117   // I think this is useless (throwing pc?)
3118   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3119 
3120   // Pop self-frame.
3121   __ leave();                           // Epilog
3122 
3123   // Jump to interpreter
3124   __ ret(0);
3125 
3126   // Make sure all code is generated
3127   masm->flush();
3128 
3129   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3130   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3131 #if INCLUDE_JVMCI
3132   if (EnableJVMCI) {
3133     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3134     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3135   }
3136 #endif
3137 
3138   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3139 }
3140 
//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
// The blob is reached when a thread is stopped at a safepoint poll. It
// saves all live registers (optionally including wide vector registers),
// records an oopmap so GC and deoptimization can find every oop and
// debug-info register, calls the runtime routine 'call_ptr', then either
// forwards a pending exception or restores registers and resumes.
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code.  Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Reuse an AOT-cached copy of this blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: the poll happened at a return instruction, so the stack
  // already holds a valid return address; otherwise we must make room for one.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    // rbx's current value is irrelevant; the slot is overwritten with the
    // real return pc (saved_exception_pc) below.
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    //      85 00       test   %eax,(%rax)
    //      85 01       test   %eax,(%rcx)
    //      85 02       test   %eax,(%rdx)
    //      85 03       test   %eax,(%rbx)
    //      85 06       test   %eax,(%rsi)
    //      85 07       test   %eax,(%rdi)
    //
    //   41 85 00       test   %eax,(%r8)
    //   41 85 01       test   %eax,(%r9)
    //   41 85 02       test   %eax,(%r10)
    //   41 85 03       test   %eax,(%r11)
    //   41 85 06       test   %eax,(%r14)
    //   41 85 07       test   %eax,(%r15)
    //
    //      85 04 24    test   %eax,(%rsp)
    //   41 85 04 24    test   %eax,(%r12)
    //      85 45 00    test   %eax,0x0(%rbp)
    //   41 85 45 00    test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is between [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     thereby adding an additional byte to the instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // Skip a two-byte REX2 prefix if present (APX extended GPR base register).
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    // Skip a one-byte REX.B prefix if present (r8..r15 base register).
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Persist the freshly generated blob for AOT code reuse.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3325 
3326 //
3327 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3328 //
3329 // Generate a stub that calls into vm to find out the proper destination
3330 // of a java call. All the argument registers are live at this point
3331 // but since this is generic code we don't know what they are and the caller
3332 // must do any gc of the args.
3333 //
3334 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3335   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3336   assert(is_resolve_id(id), "expected a resolve stub id");
3337 
3338   const char* name = SharedRuntime::stub_name(id);
3339   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3340   if (blob != nullptr) {
3341     return blob->as_runtime_stub();
3342   }
3343 
3344   // allocate space for the code
3345   ResourceMark rm;
3346   CodeBuffer buffer(name, 1552, 512);
3347   MacroAssembler* masm = new MacroAssembler(&buffer);
3348 
3349   int frame_size_in_words;
3350 
3351   OopMapSet *oop_maps = new OopMapSet();
3352   OopMap* map = nullptr;
3353 
3354   int start = __ offset();
3355 
3356   // No need to save vector registers since they are caller-saved anyway.
3357   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3358 
3359   int frame_complete = __ offset();
3360 
3361   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3362 
3363   __ mov(c_rarg0, r15_thread);
3364 
3365   __ call(RuntimeAddress(destination));
3366 
3367 
3368   // Set an oopmap for the call site.
3369   // We need this not only for callee-saved registers, but also for volatile
3370   // registers that the compiler might be keeping live across a safepoint.
3371 
3372   oop_maps->add_gc_map( __ offset() - start, map);
3373 
3374   // rax contains the address we are going to jump to assuming no exception got installed
3375 
3376   // clear last_Java_sp
3377   __ reset_last_Java_frame(false);
3378   // check for pending exceptions
3379   Label pending;
3380   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3381   __ jcc(Assembler::notEqual, pending);
3382 
3383   // get the returned Method*
3384   __ get_vm_result_metadata(rbx);
3385   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3386 
3387   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3388 
3389   RegisterSaver::restore_live_registers(masm);
3390 
3391   // We are back to the original state on entry and ready to go.
3392 
3393   __ jmp(rax);
3394 
3395   // Pending exception after the safepoint
3396 
3397   __ bind(pending);
3398 
3399   RegisterSaver::restore_live_registers(masm);
3400 
3401   // exception pending => remove activation and forward to exception handler
3402 
3403   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3404 
3405   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3406   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3407 
3408   // -------------
3409   // make sure all code is generated
3410   masm->flush();
3411 
3412   // return the  blob
3413   // frame_size_words or bytes??
3414   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3415 
3416   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3417   return rs_blob;
3418 }
3419 
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs.  If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // Offsets below are counted in 32-bit slots (BytesPerInt units).
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size  = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Reuse an AOT-cached copy of this stub if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  // (they account for 4 of the framesize 32-bit slots, hence framesize-4)
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

  // Call runtime; runtime_entry is expected to install the fabricated
  // exception as the thread's pending exception (verified below).
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3525 
3526 //------------------------------Montgomery multiplication------------------------
3527 //
3528 
3529 #ifndef _WINDOWS
3530 
// Subtract 0:b from carry:a.  Return carry.
// Computes a[0..len-1] -= b[0..len-1] in place via an x86 sbb
// (subtract-with-borrow) chain, then folds the final borrow into
// 'carry' and returns it.  Note: inc/dec do not modify CF, so the
// borrow survives the loop-control instructions between sbb steps.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // clc clears CF before the first sbb; the trailing "sbb $0" subtracts
  // the residual borrow from 'carry'.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3548 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// The one-operand x86 'mul' leaves the 128-bit product in rdx:rax;
// the add/adc chain then folds it into (T0, T1, T2) with carry
// propagation into the top word.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator (the product is added in two identical add/adc passes).
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3569 
3570 #else //_WINDOWS
3571 
// Subtract 0:b from carry:a.  Return carry.
// Windows variant: uses the identity a - b == a + ~b + 1.  The carry
// chain is seeded with 1 and the one's complement of b is added, so
// _addcarry_u64 behaves as a subtract-with-borrow.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1; // seed carry: the "+1" of the two's complement
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  // Fold the final borrow into 'carry': carry + ~0 + c == carry - 1 + c.
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}
3584 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// Windows variant: _umul128 yields the 128-bit product (lo, hi); the
// _addcarry_u64 chain folds it into (T0, T1, T2) with carry
// propagation into the top word.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator (the product is accumulated in two identical passes).
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3609 
3610 #endif //_WINDOWS
3611 
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
//
// Produces one 64-bit word of the result per outer-loop iteration;
// (t0, t1, t2) form a triple-precision column accumulator.  'inv' is
// -n^-1 mod 2^64 (asserted via inv * n[0] == ULLONG_MAX below).

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // First pass: compute the low 'len' words, choosing each m[i] so the
  // running column sum becomes divisible by 2^64 (t0 must end up 0).
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right by one 64-bit word.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: finish the remaining columns, storing the high words.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Conditionally subtract n until the residual carry is consumed.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3652 
// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
//
// Off-diagonal products a[j]*a[i-j] occur twice in a square, so they
// are accumulated once via MACC2 (which adds twice the product);
// diagonal terms a[j]*a[j] occur once and use plain MACC.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // First pass: compute the low 'len' result words.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      // Even column index: include the diagonal square term once.
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator right by one 64-bit word.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second pass: finish the high columns.
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Conditionally subtract n until the residual carry is consumed.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3708 
3709 // Swap words in a longword.
3710 static julong swap(julong x) {
3711   return (x << 32) | (x >> 32);
3712 }
3713 
3714 // Copy len longwords from s to d, word-swapping as we go.  The
3715 // destination array is reversed.
3716 static void reverse_words(julong *s, julong *d, int len) {
3717   d += len;
3718   while(len-- > 0) {
3719     d--;
3720     *d = swap(*s);
3721     s++;
3722   }
3723 }
3724 
3725 // The threshold at which squaring is advantageous was determined
3726 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3727 #define MONTGOMERY_SQUARING_THRESHOLD 64
3728 
3729 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3730                                         jint len, jlong inv,
3731                                         jint *m_ints) {
3732   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3733   int longwords = len/2;
3734 
3735   // Make very sure we don't use so much space that the stack might
3736   // overflow.  512 jints corresponds to an 16384-bit integer and
3737   // will use here a total of 8k bytes of stack space.
3738   int divisor = sizeof(julong) * 4;
3739   guarantee(longwords <= 8192 / divisor, "must be");
3740   int total_allocation = longwords * sizeof (julong) * 4;
3741   julong *scratch = (julong *)alloca(total_allocation);
3742 
3743   // Local scratch arrays
3744   julong
3745     *a = scratch + 0 * longwords,
3746     *b = scratch + 1 * longwords,
3747     *n = scratch + 2 * longwords,
3748     *m = scratch + 3 * longwords;
3749 
3750   reverse_words((julong *)a_ints, a, longwords);
3751   reverse_words((julong *)b_ints, b, longwords);
3752   reverse_words((julong *)n_ints, n, longwords);
3753 
3754   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3755 
3756   reverse_words(m, (julong *)m_ints, longwords);
3757 }
3758 
3759 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3760                                       jint len, jlong inv,
3761                                       jint *m_ints) {
3762   assert(len % 2 == 0, "array length in montgomery_square must be even");
3763   int longwords = len/2;
3764 
3765   // Make very sure we don't use so much space that the stack might
3766   // overflow.  512 jints corresponds to an 16384-bit integer and
3767   // will use here a total of 6k bytes of stack space.
3768   int divisor = sizeof(julong) * 3;
3769   guarantee(longwords <= (8192 / divisor), "must be");
3770   int total_allocation = longwords * sizeof (julong) * 3;
3771   julong *scratch = (julong *)alloca(total_allocation);
3772 
3773   // Local scratch arrays
3774   julong
3775     *a = scratch + 0 * longwords,
3776     *n = scratch + 1 * longwords,
3777     *m = scratch + 2 * longwords;
3778 
3779   reverse_words((julong *)a_ints, a, longwords);
3780   reverse_words((julong *)n_ints, n, longwords);
3781 
3782   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3783     ::montgomery_square(a, n, m, (julong)inv, longwords);
3784   } else {
3785     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3786   }
3787 
3788   reverse_words(m, (julong *)m_ints, longwords);
3789 }
3790 
// Generate the pack/unpack adapter stubs for a buffered inline type:
// "pack" stores the scalarized field registers into a heap buffer (whose
// oop is in rax), "unpack" loads the buffer's fields back into the
// registers of the scalarized return convention.  Returns nullptr if the
// code buffer cannot be allocated.
BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
  BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
  if (buf == nullptr) {
    return nullptr;
  }
  CodeBuffer buffer(buf);
  short buffer_locs[20];
  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
                                         sizeof(buffer_locs)/sizeof(relocInfo));

  MacroAssembler* masm = new MacroAssembler(&buffer);

  // Extended signature (contains T_METADATA and T_VOID marker entries in
  // addition to the field types) and the field register assignments.
  const Array<SigEntry>* sig_vk = vk->extended_sig();
  const Array<VMRegPair>* regs = vk->return_regs();

  // --- pack (jobject variant): resolve the buffer handle first ---------
  int pack_fields_jobject_off = __ offset();
  // Resolve pre-allocated buffer from JNI handle.
  // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
  __ movptr(rax, Address(r13, 0));
  __ resolve_jobject(rax /* value */,
                     r12 /* tmp */);
  __ movptr(Address(r13, 0), rax);

  // --- pack: store each field register into the buffer at rax ----------
  int pack_fields_off = __ offset();

  // j indexes regs; it starts at 1 (entry 0 is not a field register —
  // presumably reserved for the buffer oop itself, TODO confirm).
  int j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue; // marker entry, no field to store
    }
    if (bt == T_VOID) {
      // Upper-half marker: a preceding long/double consumed an extra
      // register pair, so advance j to stay in sync.
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    Address to(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(to, r_1->as_XMMRegister());
    } else if (bt == T_DOUBLE) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      Register val = r_1->as_Register();
      assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
      if (is_reference_type(bt)) {
        // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
        __ mov(rbx, rax);
        Address to_with_rbx(rbx, off);
        __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        // Sub-word primitives are stored with their exact field size.
        __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
      }
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");
  if (vk->supports_nullable_layouts()) {
    // Set the null marker
    __ movb(Address(rax, vk->null_marker_offset()), 1);
  }
  __ ret(0);

  // --- unpack: load the buffer's fields into the field registers -------
  int unpack_fields_off = __ offset();

  Label skip;
  Label not_null;
  // rax holds the buffer oop, or null for a null inline-type value.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, not_null);

  // Return value is null. Zero all registers because the runtime requires a canonical
  // representation of a flat null.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }

    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    if (r_1->is_XMMRegister()) {
      __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
    } else {
      __ xorl(r_1->as_Register(), r_1->as_Register());
    }
    j++;
  }
  __ jmp(skip);
  __ bind(not_null);

  // Non-null: load each field from the buffer into its register.
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      // rax (the buffer base) must survive all the field loads.
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      // Sign-extend everything except char and boolean, which zero-extend.
      size_t size_in_bytes = type2aelembytes(bt);
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}
3937 
3938 #if INCLUDE_JFR
3939 
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Stack frame layout in VMRegImpl slots; framesize includes the
  // saved rbp and the return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  // The frame is complete once rbp is pushed; record this pc for the
  // last_Java_frame anchor and the GC map below.
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  // Single argument: the current thread.
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  // Empty oop map at the call site; no oops live across the runtime call.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3986 
// For c2: call to return a leased buffer.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Stack frame layout in VMRegImpl slots; framesize includes the
  // saved rbp and the return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  // The frame is complete once rbp is pushed; record this pc for the
  // last_Java_frame anchor and the GC map below.
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  // Single argument: the current thread.
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  // Empty oop map at the call site; no oops live across the runtime call.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
4028 
4029 #endif // INCLUDE_JFR