/*
 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "classfile/symbolTable.hpp"
#include "code/aotCodeCache.hpp"
#include "code/compiledIC.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "oops/method.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/globals.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/timerTrace.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };
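  // Note: the enum values above are compiler stack slots (jints, BytesPerInt
  // bytes each), so every 64-bit register occupies two consecutive slots,
  // e.g. rax_off/raxH_off. The *_offset_in_bytes() accessors below simply
  // scale a slot index by BytesPerInt.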

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter().

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiples of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
      int base_addr = XSAVE_AREA_EGPRS;
      off = 0;
      for (int n = 16; n < Register::number_of_registers; n++) {
        __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
      }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is also included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is also included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

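// Sketch of the frame produced by save_live_registers() above, following the
// 'layout' enum (highest addresses first): the return address and the caller's
// rbp sit at the top (rbp pushed by enter()), below them the flags word, the
// alignment filler and the general purpose registers, and the FPU/XSAVE image
// occupies the lowest addresses of the save area.
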
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}
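// For example, a 32-byte YMM or 64-byte ZMM vector counts as wide and needs
// the extra handling in save/restore_live_registers() above, while a 16-byte
// XMM vector is already covered by fxsave/fxrstor.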

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 1;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        stk_args = align_up(stk_args, 2);
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return stk_args;
}
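
// Illustrative example: for a signature of
// { T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT, T_DOUBLE, T_VOID } the loop
// above assigns j_rarg0 (int), j_rarg1 (long), j_rarg2 (object), j_farg0
// (float) and j_farg1 (double), and returns 0 because no stack slots were
// needed.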

// Same as java_calling_convention() but for multiple return
// values. There's no way to store them on the stack so if we don't
// have enough registers, multiple values can't be returned.
const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
int SharedRuntime::java_return_convention(const BasicType *sig_bt,
                                          VMRegPair *regs,
                                          int total_args_passed) {
  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[java_return_convention_max_int] = {
    rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
  };
  static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j+1) {
        regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
        int_args++;
      } else {
        return -1;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_j+1) {
        regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
        int_args++;
      } else {
        return -1;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
        fp_args++;
      } else {
        return -1;
      }
      break;
    case T_DOUBLE:
      assert(sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
        fp_args++;
      } else {
        return -1;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return int_args + fp_args;
}
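
// Illustrative example: a single T_INT return value lands in INT_ArgReg[0],
// i.e. rax, and the function returns 1; once more values are requested than
// registers are available the function returns -1 and multiple return values
// cannot be used.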

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}

// For each inline type argument, sig includes the list of fields of
// the inline type. This utility function computes the number of
// arguments for the call if inline types are passed by reference (the
// calling convention the interpreter expects).
static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
  int total_args_passed = 0;
  if (InlineTypePassFieldsAsArgs) {
    for (int i = 0; i < sig_extended->length(); i++) {
      BasicType bt = sig_extended->at(i)._bt;
      if (bt == T_METADATA) {
        // In sig_extended, an inline type argument starts with:
        // T_METADATA, followed by the types of the fields of the
        // inline type and T_VOID to mark the end of the value
        // type. Inline types are flattened so, for instance, in the
        // case of an inline type with an int field and an inline type
        // field that itself has 2 fields, an int and a long:
        // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
        // slot for the T_LONG) T_VOID (inner inline type) T_VOID
        // (outer inline type)
        total_args_passed++;
        int vt = 1;
        do {
          i++;
          BasicType bt = sig_extended->at(i)._bt;
          BasicType prev_bt = sig_extended->at(i-1)._bt;
          if (bt == T_METADATA) {
            vt++;
          } else if (bt == T_VOID &&
                     prev_bt != T_LONG &&
                     prev_bt != T_DOUBLE) {
            vt--;
          }
        } while (vt != 0);
      } else {
        total_args_passed++;
      }
    }
  } else {
    total_args_passed = sig_extended->length();
  }
  return total_args_passed;
}
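
// Worked example (matching the comment above): for the flattened sequence
// T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID T_VOID T_VOID the nesting
// counter vt goes 1 -> 2 -> 1 -> 0 (the T_VOID right after T_LONG is the
// long's second slot and does not decrement it), so the whole sequence is
// counted as a single interpreter argument.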


static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However, to make things extra confusing, because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  if (!r_1->is_valid()) {
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    Register val = rax;
    if (r_1->is_stack()) {
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}

static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
      __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      ignored++;
      // get the buffer from the just allocated pool of buffers
      int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
      __ load_heap_oop(r14, Address(rscratch2, index));
      next_vt_arg++; next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          vt--;
          ignored++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ movptr(Address(rsp, st_off), r14);
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // sees the return address placed just as a call instruction would have left it
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  int total_args_passed = sig->length();

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
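
// Illustrative note: in the shuffle loop above the interpreter arguments are
// read top-down from the caller's expression stack, e.g. with
// total_args_passed == 2 the first argument is loaded from saved_sp + 16 and
// the second from saved_sp + 8 (Interpreter::stackElementSize is one word on
// x86_64), while the destinations are the registers/stack slots of the
// compiled Java calling convention passed in via 'regs'.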
1264 
1265 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1266   Register data = rax;
1267   __ ic_check(1 /* end_alignment */);
1268   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1269 
1270   // The method might have been compiled since the call site was patched to
1271   // interpreted; if that is the case, treat it as a miss so we can get
1272   // the call site corrected.
1273   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1274   __ jcc(Assembler::equal, skip_fixup);
1275   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1276 }
1277 
1278 // ---------------------------------------------------------------
1279 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1280                                             int comp_args_on_stack,
1281                                             const GrowableArray<SigEntry>* sig,
1282                                             const VMRegPair* regs,
1283                                             const GrowableArray<SigEntry>* sig_cc,
1284                                             const VMRegPair* regs_cc,
1285                                             const GrowableArray<SigEntry>* sig_cc_ro,
1286                                             const VMRegPair* regs_cc_ro,
1287                                             address entry_address[AdapterBlob::ENTRY_COUNT],
1288                                             AdapterBlob*& new_adapter,
1289                                             bool allocate_code_blob) {
1290   entry_address[AdapterBlob::I2C] = __ pc();
1291   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1292 
1293   // -------------------------------------------------------------------------
1294   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1295   // to the interpreter.  The args start out packed in the compiled layout.  They
1296   // need to be unpacked into the interpreter layout.  This will almost always
1297   // require some stack space.  We grow the current (compiled) stack, then repack
1298   // the args.  We finally end in a jump to the generic interpreter entry point.
1299   // On exit from the interpreter, the interpreter will restore our SP (lest the
1300   // compiled code, which relies solely on SP and not RBP, get sick).
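       // The remaining entry_address slots filled in below cover the C2I flavors (unverified,
       // scalarized, scalarized with a non-scalarized receiver, non-scalarized, and the
       // no-clinit-check entry); all entries are converted to offsets and stored in the
       // AdapterBlob at the end when allocate_code_blob is set.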
1301 
1302   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1303   entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1304   Label skip_fixup;
1305 
1306   gen_inline_cache_check(masm, skip_fixup);
1307 
1308   OopMapSet* oop_maps = new OopMapSet();
1309   int frame_complete = CodeOffsets::frame_never_safe;
1310   int frame_size_in_words = 0;
1311 
1312   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1313   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1314   entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
1315   if (regs_cc != regs_cc_ro) {
1316     // No class init barrier needed because method is guaranteed to be non-static
1317     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1318                     skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1319     skip_fixup.reset();
1320   }
1321 
1322   // Scalarized c2i adapter
1323   entry_address[AdapterBlob::C2I]        = __ pc();
1324   entry_address[AdapterBlob::C2I_Inline] = __ pc();
1325   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1326                   skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1327 
1328   // Non-scalarized c2i adapter
1329   if (regs != regs_cc) {
1330     entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
1331     Label inline_entry_skip_fixup;
1332     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1333 
1334     entry_address[AdapterBlob::C2I_Inline] = __ pc();
1335     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
1336                     inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1337   }
1338 
1339   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1340   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1341   if (allocate_code_blob) {
1342     bool caller_must_gc_arguments = (regs != regs_cc);
1343     int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
1344     assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
1345     AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
1346     new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1347   }
1348 }
1349 
1350 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1351                                          VMRegPair *regs,
1352                                          int total_args_passed) {
1353 
1354 // We return the number of VMRegImpl stack slots we need to reserve for all
1355 // the arguments, NOT counting out_preserve_stack_slots.
1356 
1357 // NOTE: These arrays will have to change when c1 is ported
1358 #ifdef _WIN64
1359     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1360       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1361     };
1362     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1363       c_farg0, c_farg1, c_farg2, c_farg3
1364     };
1365 #else
1366     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1367       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1368     };
1369     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1370       c_farg0, c_farg1, c_farg2, c_farg3,
1371       c_farg4, c_farg5, c_farg6, c_farg7
1372     };
1373 #endif // _WIN64
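         // Purely illustrative mapping (not used by the code): for a signature of
         // (T_INT, T_LONG, T_FLOAT, T_DOUBLE) -- with the usual T_VOID halves following
         // the long and the double -- the loop below assigns:
         //   - SysV (linux): T_INT -> c_rarg0, T_LONG -> c_rarg1, T_FLOAT -> c_farg0,
         //     T_DOUBLE -> c_farg1, and stk_args stays 0.
         //   - Win64: arguments are positional, so T_INT -> c_rarg0, T_LONG -> c_rarg1,
         //     T_FLOAT -> c_farg2, T_DOUBLE -> c_farg3, and stk_args accumulates the
         //     mandatory 8-slot shadow area enforced at the end of the function.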
1374 
1375 
1376     uint int_args = 0;
1377     uint fp_args = 0;
1378     uint stk_args = 0; // inc by 2 each time
1379 
1380     for (int i = 0; i < total_args_passed; i++) {
1381       switch (sig_bt[i]) {
1382       case T_BOOLEAN:
1383       case T_CHAR:
1384       case T_BYTE:
1385       case T_SHORT:
1386       case T_INT:
1387         if (int_args < Argument::n_int_register_parameters_c) {
1388           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1389 #ifdef _WIN64
1390           fp_args++;
1391           // Allocate slots for the callee to spill register args onto the stack.
1392           stk_args += 2;
1393 #endif
1394         } else {
1395           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1396           stk_args += 2;
1397         }
1398         break;
1399       case T_LONG:
1400         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1401         // fall through
1402       case T_OBJECT:
1403       case T_ARRAY:
1404       case T_ADDRESS:
1405       case T_METADATA:
1406         if (int_args < Argument::n_int_register_parameters_c) {
1407           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1408 #ifdef _WIN64
1409           fp_args++;
1410           stk_args += 2;
1411 #endif
1412         } else {
1413           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1414           stk_args += 2;
1415         }
1416         break;
1417       case T_FLOAT:
1418         if (fp_args < Argument::n_float_register_parameters_c) {
1419           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1420 #ifdef _WIN64
1421           int_args++;
1422           // Allocate slots for the callee to spill register args onto the stack.
1423           stk_args += 2;
1424 #endif
1425         } else {
1426           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1427           stk_args += 2;
1428         }
1429         break;
1430       case T_DOUBLE:
1431         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1432         if (fp_args < Argument::n_float_register_parameters_c) {
1433           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1434 #ifdef _WIN64
1435           int_args++;
1436           // Allocate slots for the callee to spill register args onto the stack.
1437           stk_args += 2;
1438 #endif
1439         } else {
1440           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1441           stk_args += 2;
1442         }
1443         break;
1444       case T_VOID: // Halves of longs and doubles
1445         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1446         regs[i].set_bad();
1447         break;
1448       default:
1449         ShouldNotReachHere();
1450         break;
1451       }
1452     }
1453 #ifdef _WIN64
1454   // The Windows ABI requires that we always allocate enough stack space
1455   // for 4 64-bit registers to be stored down (the shadow/home space).
1456   if (stk_args < 8) {
1457     stk_args = 8;
1458   }
1459 #endif // _WIN64
1460 
1461   return stk_args;
1462 }
1463 
1464 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1465                                              uint num_bits,
1466                                              uint total_args_passed) {
1467   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1468          "only certain vector sizes are supported for now");
1469 
1470   static const XMMRegister VEC_ArgReg[32] = {
1471      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1472      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1473     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1474     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1475   };
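       // Each vector argument is passed in the next XMM register in order; the VMRegPair
       // spans num_bits/32 stack-slot-sized VMRegs (e.g. a 256-bit vector occupies
       // vmreg .. vmreg->next(7)), and no stack slots are used.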
1476 
1477   uint stk_args = 0;
1478   uint fp_args = 0;
1479 
1480   for (uint i = 0; i < total_args_passed; i++) {
1481     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1482     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1483     regs[i].set_pair(vmreg->next(next_val), vmreg);
1484   }
1485 
1486   return stk_args;
1487 }
1488 
1489 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1490   // We always ignore the frame_slots arg and just use the space immediately below the
1491   // frame pointer, which by this time is free to use.
1492   switch (ret_type) {
1493   case T_FLOAT:
1494     __ movflt(Address(rbp, -wordSize), xmm0);
1495     break;
1496   case T_DOUBLE:
1497     __ movdbl(Address(rbp, -wordSize), xmm0);
1498     break;
1499   case T_VOID:  break;
1500   default: {
1501     __ movptr(Address(rbp, -wordSize), rax);
1502     }
1503   }
1504 }
1505 
1506 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1507   // We always ignore the frame_slots arg and just use the space immediately below the
1508   // frame pointer, which by this time is free to use.
1509   switch (ret_type) {
1510   case T_FLOAT:
1511     __ movflt(xmm0, Address(rbp, -wordSize));
1512     break;
1513   case T_DOUBLE:
1514     __ movdbl(xmm0, Address(rbp, -wordSize));
1515     break;
1516   case T_VOID:  break;
1517   default: {
1518     __ movptr(rax, Address(rbp, -wordSize));
1519     }
1520   }
1521 }
1522 
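     // save_args/restore_args spill and reload the outgoing C arguments around a VM call:
     // general-purpose argument registers are pushed/popped, while XMM argument registers
     // each get a two-word stack slot; restore_args walks the arguments in reverse so the
     // stack is unwound in LIFO order.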
1523 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1524     for ( int i = first_arg ; i < arg_count ; i++ ) {
1525       if (args[i].first()->is_Register()) {
1526         __ push(args[i].first()->as_Register());
1527       } else if (args[i].first()->is_XMMRegister()) {
1528         __ subptr(rsp, 2*wordSize);
1529         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1530       }
1531     }
1532 }
1533 
1534 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1535     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1536       if (args[i].first()->is_Register()) {
1537         __ pop(args[i].first()->as_Register());
1538       } else if (args[i].first()->is_XMMRegister()) {
1539         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1540         __ addptr(rsp, 2*wordSize);
1541       }
1542     }
1543 }
1544 
1545 static void verify_oop_args(MacroAssembler* masm,
1546                             const methodHandle& method,
1547                             const BasicType* sig_bt,
1548                             const VMRegPair* regs) {
1549   Register temp_reg = rbx;  // not part of any compiled calling seq
1550   if (VerifyOops) {
1551     for (int i = 0; i < method->size_of_parameters(); i++) {
1552       if (is_reference_type(sig_bt[i])) {
1553         VMReg r = regs[i].first();
1554         assert(r->is_valid(), "bad oop arg");
1555         if (r->is_stack()) {
1556           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1557           __ verify_oop(temp_reg);
1558         } else {
1559           __ verify_oop(r->as_Register());
1560         }
1561       }
1562     }
1563   }
1564 }
1565 
1566 static void check_continuation_enter_argument(VMReg actual_vmreg,
1567                                               Register expected_reg,
1568                                               const char* name) {
1569   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1570   assert(actual_vmreg->as_Register() == expected_reg,
1571          "%s is in unexpected register: %s instead of %s",
1572          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1573 }
1574 
1575 
1576 //---------------------------- continuation_enter_setup ---------------------------
1577 //
1578 // Arguments:
1579 //   None.
1580 //
1581 // Results:
1582 //   rsp: pointer to blank ContinuationEntry
1583 //
1584 // Kills:
1585 //   rax
1586 //
1587 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1588   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1589   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1590   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1591 
1592   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1593   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1594 
1595   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1596   OopMap* map = new OopMap(frame_size, 0);
1597 
1598   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1599   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1600   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1601 
1602   return map;
1603 }
1604 
1605 //---------------------------- fill_continuation_entry ---------------------------
1606 //
1607 // Arguments:
1608 //   rsp: pointer to blank Continuation entry
1609 //   reg_cont_obj: pointer to the continuation
1610 //   reg_flags: flags
1611 //
1612 // Results:
1613 //   rsp: pointer to filled out ContinuationEntry
1614 //
1615 // Kills:
1616 //   rax
1617 //
1618 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1619   assert_different_registers(rax, reg_cont_obj, reg_flags);
1620 #ifdef ASSERT
1621   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1622 #endif
1623   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1624   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1625   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1626   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1627   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1628 
1629   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1630   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1631 
1632   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1633 }
1634 
1635 //---------------------------- continuation_enter_cleanup ---------------------------
1636 //
1637 // Arguments:
1638 //   rsp: pointer to the ContinuationEntry
1639 //
1640 // Results:
1641 //   rsp: pointer to the spilled rbp in the entry frame
1642 //
1643 // Kills:
1644 //   rbx
1645 //
1646 static void continuation_enter_cleanup(MacroAssembler* masm) {
1647 #ifdef ASSERT
1648   Label L_good_sp;
1649   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1650   __ jcc(Assembler::equal, L_good_sp);
1651   __ stop("Incorrect rsp at continuation_enter_cleanup");
1652   __ bind(L_good_sp);
1653 #endif
1654   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1655   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1656   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1657   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1658   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1659 }
1660 
1661 static void gen_continuation_enter(MacroAssembler* masm,
1662                                    const VMRegPair* regs,
1663                                    int& exception_offset,
1664                                    OopMapSet* oop_maps,
1665                                    int& frame_complete,
1666                                    int& stack_slots,
1667                                    int& interpreted_entry_offset,
1668                                    int& compiled_entry_offset) {
1669 
1670   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1671   int pos_cont_obj   = 0;
1672   int pos_is_cont    = 1;
1673   int pos_is_virtual = 2;
1674 
1675   // The platform-specific calling convention may present the arguments in various registers.
1676   // To simplify the rest of the code, we expect the arguments to reside in these known
1677   // registers, and we additionally check the placement here in case the calling convention
1678   // ever changes.
1679   Register reg_cont_obj   = c_rarg1;
1680   Register reg_is_cont    = c_rarg2;
1681   Register reg_is_virtual = c_rarg3;
1682 
1683   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1684   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1685   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1686 
1687   // Utility methods kill rax, make sure there are no collisions
1688   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1689 
1690   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1691                          relocInfo::static_call_type);
1692 
1693   address start = __ pc();
1694 
1695   Label L_thaw, L_exit;
1696 
1697   // i2i entry used at interp_only_mode only
1698   interpreted_entry_offset = __ pc() - start;
1699   {
1700 #ifdef ASSERT
1701     Label is_interp_only;
1702     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1703     __ jcc(Assembler::notEqual, is_interp_only);
1704     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1705     __ bind(is_interp_only);
1706 #endif
1707 
1708     __ pop(rax); // return address
1709     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1710     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1711     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1712     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1713     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1714     __ push(rax); // return address
1715     __ push_cont_fastpath();
1716 
1717     __ enter();
1718 
1719     stack_slots = 2; // will be adjusted in setup
1720     OopMap* map = continuation_enter_setup(masm, stack_slots);
1721     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1722     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1723 
1724     __ verify_oop(reg_cont_obj);
1725 
1726     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1727 
1728     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1729     __ testptr(reg_is_cont, reg_is_cont);
1730     __ jcc(Assembler::notZero, L_thaw);
1731 
1732     // --- Resolve path
1733 
1734     // Make sure the call is patchable
1735     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1736     // Emit stub for static call
1737     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1738     if (stub == nullptr) {
1739       fatal("CodeCache is full at gen_continuation_enter");
1740     }
1741     __ call(resolve);
1742     oop_maps->add_gc_map(__ pc() - start, map);
1743     __ post_call_nop();
1744 
1745     __ jmp(L_exit);
1746   }
1747 
1748   // compiled entry
1749   __ align(CodeEntryAlignment);
1750   compiled_entry_offset = __ pc() - start;
1751   __ enter();
1752 
1753   stack_slots = 2; // will be adjusted in setup
1754   OopMap* map = continuation_enter_setup(masm, stack_slots);
1755 
1756   // Frame is now completed as far as size and linkage.
1757   frame_complete = __ pc() - start;
1758 
1759   __ verify_oop(reg_cont_obj);
1760 
1761   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1762 
1763   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1764   __ testptr(reg_is_cont, reg_is_cont);
1765   __ jccb(Assembler::notZero, L_thaw);
1766 
1767   // --- call Continuation.enter(Continuation c, boolean isContinue)
1768 
1769   // Make sure the call is patchable
1770   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1771 
1772   // Emit stub for static call
1773   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1774   if (stub == nullptr) {
1775     fatal("CodeCache is full at gen_continuation_enter");
1776   }
1777 
1778   // The call needs to be resolved. There's a special case for this in
1779   // SharedRuntime::find_callee_info_helper() which calls
1780   // LinkResolver::resolve_continuation_enter() which resolves the call to
1781   // Continuation.enter(Continuation c, boolean isContinue).
1782   __ call(resolve);
1783 
1784   oop_maps->add_gc_map(__ pc() - start, map);
1785   __ post_call_nop();
1786 
1787   __ jmpb(L_exit);
1788 
1789   // --- Thawing path
1790 
1791   __ bind(L_thaw);
1792 
1793   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1794   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1795 
1796   ContinuationEntry::_return_pc_offset = __ pc() - start;
1797   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1798   __ post_call_nop();
1799 
1800   // --- Normal exit (resolve/thawing)
1801 
1802   __ bind(L_exit);
1803   ContinuationEntry::_cleanup_offset = __ pc() - start;
1804   continuation_enter_cleanup(masm);
1805   __ pop(rbp);
1806   __ ret(0);
1807 
1808   // --- Exception handling path
1809 
1810   exception_offset = __ pc() - start;
1811 
1812   continuation_enter_cleanup(masm);
1813   __ pop(rbp);
1814 
1815   __ movptr(c_rarg0, r15_thread);
1816   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1817 
1818   // rax still holds the original exception oop, save it before the call
1819   __ push(rax);
1820 
1821   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1822   __ movptr(rbx, rax);
1823 
1824   // Continue at exception handler:
1825   //   rax: exception oop
1826   //   rbx: exception handler
1827   //   rdx: exception pc
1828   __ pop(rax);
1829   __ verify_oop(rax);
1830   __ pop(rdx);
1831   __ jmp(rbx);
1832 }
1833 
1834 static void gen_continuation_yield(MacroAssembler* masm,
1835                                    const VMRegPair* regs,
1836                                    OopMapSet* oop_maps,
1837                                    int& frame_complete,
1838                                    int& stack_slots,
1839                                    int& compiled_entry_offset) {
1840   enum layout {
1841     rbp_off,
1842     rbpH_off,
1843     return_off,
1844     return_off2,
1845     framesize // inclusive of return address
1846   };
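       // framesize is 4 VMReg slots: two for the saved rbp and two for the return address,
       // i.e. a two-word frame, which is what the assert below checks.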
1847   stack_slots = framesize /  VMRegImpl::slots_per_word;
1848   assert(stack_slots == 2, "recheck layout");
1849 
1850   address start = __ pc();
1851   compiled_entry_offset = __ pc() - start;
1852   __ enter();
1853   address the_pc = __ pc();
1854 
1855   frame_complete = the_pc - start;
1856 
1857   // This nop must be exactly at the PC we push into the frame info.
1858   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1859   // with it right away.
1860   __ post_call_nop();
1861   OopMap* map = new OopMap(framesize, 1);
1862   oop_maps->add_gc_map(frame_complete, map);
1863 
1864   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1865   __ movptr(c_rarg0, r15_thread);
1866   __ movptr(c_rarg1, rsp);
1867   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1868   __ reset_last_Java_frame(true);
1869 
1870   Label L_pinned;
1871 
1872   __ testptr(rax, rax);
1873   __ jcc(Assembler::notZero, L_pinned);
1874 
1875   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1876   continuation_enter_cleanup(masm);
1877   __ pop(rbp);
1878   __ ret(0);
1879 
1880   __ bind(L_pinned);
1881 
1882   // Pinned, return to caller
1883 
1884   // handle pending exception thrown by freeze
1885   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1886   Label ok;
1887   __ jcc(Assembler::equal, ok);
1888   __ leave();
1889   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1890   __ bind(ok);
1891 
1892   __ leave();
1893   __ ret(0);
1894 }
1895 
1896 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1897   ::continuation_enter_cleanup(masm);
1898 }
1899 
1900 static void gen_special_dispatch(MacroAssembler* masm,
1901                                  const methodHandle& method,
1902                                  const BasicType* sig_bt,
1903                                  const VMRegPair* regs) {
1904   verify_oop_args(masm, method, sig_bt, regs);
1905   vmIntrinsics::ID iid = method->intrinsic_id();
1906 
1907   // Now write the args into the outgoing interpreter space
1908   bool     has_receiver   = false;
1909   Register receiver_reg   = noreg;
1910   int      member_arg_pos = -1;
1911   Register member_reg     = noreg;
1912   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1913   if (ref_kind != 0) {
1914     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1915     member_reg = rbx;  // known to be free at this point
1916     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1917   } else if (iid == vmIntrinsics::_invokeBasic) {
1918     has_receiver = true;
1919   } else if (iid == vmIntrinsics::_linkToNative) {
1920     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1921     member_reg = rbx;  // known to be free at this point
1922   } else {
1923     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1924   }
1925 
1926   if (member_reg != noreg) {
1927     // Load the member_arg into register, if necessary.
1928     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1929     VMReg r = regs[member_arg_pos].first();
1930     if (r->is_stack()) {
1931       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1932     } else {
1933       // no data motion is needed
1934       member_reg = r->as_Register();
1935     }
1936   }
1937 
1938   if (has_receiver) {
1939     // Make sure the receiver is loaded into a register.
1940     assert(method->size_of_parameters() > 0, "oob");
1941     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1942     VMReg r = regs[0].first();
1943     assert(r->is_valid(), "bad receiver arg");
1944     if (r->is_stack()) {
1945       // Porting note:  This assumes that compiled calling conventions always
1946       // pass the receiver oop in a register.  If this is not true on some
1947       // platform, pick a temp and load the receiver from stack.
1948       fatal("receiver always in a register");
1949       receiver_reg = j_rarg0;  // known to be free at this point
1950       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1951     } else {
1952       // no data motion is needed
1953       receiver_reg = r->as_Register();
1954     }
1955   }
1956 
1957   // Figure out which address we are really jumping to:
1958   MethodHandles::generate_method_handle_dispatch(masm, iid,
1959                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1960 }
1961 
1962 // ---------------------------------------------------------------------------
1963 // Generate a native wrapper for a given method.  The method takes arguments
1964 // in the Java compiled code convention, marshals them to the native
1965 // convention (handlizes oops, etc), transitions to native, makes the call,
1966 // returns to java state (possibly blocking), unhandlizes any result and
1967 // returns.
1968 //
1969 // Critical native functions are a shorthand for the use of
1970 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1971 // functions.  The wrapper is expected to unpack the arguments before
1972 // passing them to the callee. Critical native functions leave the state _in_Java,
1973 // since they cannot stop for GC.
1974 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1975 // block and the check for pending exceptions, since it's impossible for them
1976 // to be thrown.
1977 //
1978 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1979                                                 const methodHandle& method,
1980                                                 int compile_id,
1981                                                 BasicType* in_sig_bt,
1982                                                 VMRegPair* in_regs,
1983                                                 BasicType ret_type) {
1984   if (method->is_continuation_native_intrinsic()) {
1985     int exception_offset = -1;
1986     OopMapSet* oop_maps = new OopMapSet();
1987     int frame_complete = -1;
1988     int stack_slots = -1;
1989     int interpreted_entry_offset = -1;
1990     int vep_offset = -1;
1991     if (method->is_continuation_enter_intrinsic()) {
1992       gen_continuation_enter(masm,
1993                              in_regs,
1994                              exception_offset,
1995                              oop_maps,
1996                              frame_complete,
1997                              stack_slots,
1998                              interpreted_entry_offset,
1999                              vep_offset);
2000     } else if (method->is_continuation_yield_intrinsic()) {
2001       gen_continuation_yield(masm,
2002                              in_regs,
2003                              oop_maps,
2004                              frame_complete,
2005                              stack_slots,
2006                              vep_offset);
2007     } else {
2008       guarantee(false, "Unknown Continuation native intrinsic");
2009     }
2010 
2011 #ifdef ASSERT
2012     if (method->is_continuation_enter_intrinsic()) {
2013       assert(interpreted_entry_offset != -1, "Must be set");
2014       assert(exception_offset != -1,         "Must be set");
2015     } else {
2016       assert(interpreted_entry_offset == -1, "Must be unset");
2017       assert(exception_offset == -1,         "Must be unset");
2018     }
2019     assert(frame_complete != -1,    "Must be set");
2020     assert(stack_slots != -1,       "Must be set");
2021     assert(vep_offset != -1,        "Must be set");
2022 #endif
2023 
2024     __ flush();
2025     nmethod* nm = nmethod::new_native_nmethod(method,
2026                                               compile_id,
2027                                               masm->code(),
2028                                               vep_offset,
2029                                               frame_complete,
2030                                               stack_slots,
2031                                               in_ByteSize(-1),
2032                                               in_ByteSize(-1),
2033                                               oop_maps,
2034                                               exception_offset);
2035     if (nm == nullptr) return nm;
2036     if (method->is_continuation_enter_intrinsic()) {
2037       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2038     } else if (method->is_continuation_yield_intrinsic()) {
2039       _cont_doYield_stub = nm;
2040     }
2041     return nm;
2042   }
2043 
2044   if (method->is_method_handle_intrinsic()) {
2045     vmIntrinsics::ID iid = method->intrinsic_id();
2046     intptr_t start = (intptr_t)__ pc();
2047     int vep_offset = ((intptr_t)__ pc()) - start;
2048     gen_special_dispatch(masm,
2049                          method,
2050                          in_sig_bt,
2051                          in_regs);
2052     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2053     __ flush();
2054     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2055     return nmethod::new_native_nmethod(method,
2056                                        compile_id,
2057                                        masm->code(),
2058                                        vep_offset,
2059                                        frame_complete,
2060                                        stack_slots / VMRegImpl::slots_per_word,
2061                                        in_ByteSize(-1),
2062                                        in_ByteSize(-1),
2063                                        nullptr);
2064   }
2065   address native_func = method->native_function();
2066   assert(native_func != nullptr, "must have function");
2067 
2068   // An OopMap for lock (and class if static)
2069   OopMapSet *oop_maps = new OopMapSet();
2070   intptr_t start = (intptr_t)__ pc();
2071 
2072   // We have received a description of where all the java args are located
2073   // on entry to the wrapper. We need to convert these args to where
2074   // the jni function will expect them. To figure out where they go
2075   // we convert the java signature to a C signature by inserting
2076   // the hidden arguments as arg[0] and possibly arg[1] (static method).
2077 
2078   const int total_in_args = method->size_of_parameters();
2079   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2080 
2081   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2082   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2083 
2084   int argc = 0;
2085   out_sig_bt[argc++] = T_ADDRESS;
2086   if (method->is_static()) {
2087     out_sig_bt[argc++] = T_OBJECT;
2088   }
2089 
2090   for (int i = 0; i < total_in_args ; i++ ) {
2091     out_sig_bt[argc++] = in_sig_bt[i];
2092   }
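       // Purely illustrative: for a static native method taking (jint, jobject) this yields
       // out_sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_INT, T_OBJECT }
       // with total_c_args == total_in_args + 2.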
2093 
2094   // Now figure out where the args must be stored and how much stack space
2095   // they require.
2096   int out_arg_slots;
2097   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2098 
2099   // Compute framesize for the wrapper.  We need to handlize all oops in
2100   // incoming registers
2101 
2102   // Calculate the total number of stack slots we will need.
2103 
2104   // First count the abi requirement plus all of the outgoing args
2105   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2106 
2107   // Now the space for the inbound oop handle area
2108   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2109 
2110   int oop_handle_offset = stack_slots;
2111   stack_slots += total_save_slots;
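       // The oop handle area provides one word per potential register-held oop argument;
       // handlized oops are spilled here so the oop map built below can describe them.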
2112 
2113   // Now any space we need for handlizing a klass if static method
2114 
2115   int klass_slot_offset = 0;
2116   int klass_offset = -1;
2117   int lock_slot_offset = 0;
2118   bool is_static = false;
2119 
2120   if (method->is_static()) {
2121     klass_slot_offset = stack_slots;
2122     stack_slots += VMRegImpl::slots_per_word;
2123     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2124     is_static = true;
2125   }
2126 
2127   // Plus a lock if needed
2128 
2129   if (method->is_synchronized()) {
2130     lock_slot_offset = stack_slots;
2131     stack_slots += VMRegImpl::slots_per_word;
2132   }
2133 
2134   // Now a place (+2 slots) to save return values or temps during shuffling,
2135   // plus 4 slots for the return address (which we own) and the saved rbp.
2136   stack_slots += 6;
2137 
2138   // Ok The space we have allocated will look like:
2139   //
2140   //
2141   // FP-> |                     |
2142   //      |---------------------|
2143   //      | 2 slots for moves   |
2144   //      |---------------------|
2145   //      | lock box (if sync)  |
2146   //      |---------------------| <- lock_slot_offset
2147   //      | klass (if static)   |
2148   //      |---------------------| <- klass_slot_offset
2149   //      | oopHandle area      |
2150   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2151   //      | outbound memory     |
2152   //      | based arguments     |
2153   //      |                     |
2154   //      |---------------------|
2155   //      |                     |
2156   // SP-> | out_preserved_slots |
2157   //
2158   //
2159 
2160 
2161   // Now compute actual number of stack words we need rounding to make
2162   // stack properly aligned.
2163   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2164 
2165   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2166 
2167   // First thing make an ic check to see if we should even be here
2168 
2169   // We are free to use all registers as temps without saving them and
2170   // restoring them except rbp. rbp is the only callee save register
2171   // as far as the interpreter and the compiler(s) are concerned.
2172 
2173   const Register receiver = j_rarg0;
2174 
2175   Label exception_pending;
2176 
2177   assert_different_registers(receiver, rscratch1, rscratch2);
2178   __ verify_oop(receiver);
2179   __ ic_check(8 /* end_alignment */);
2180 
2181   int vep_offset = ((intptr_t)__ pc()) - start;
2182 
2183   if (method->needs_clinit_barrier()) {
2184     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2185     Label L_skip_barrier;
2186     Register klass = r10;
2187     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2188     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2189 
2190     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2191 
2192     __ bind(L_skip_barrier);
2193   }
2194 
2195 #ifdef COMPILER1
2196   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2197   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2198     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2199   }
2200 #endif // COMPILER1
2201 
2202   // The instruction at the verified entry point must be 5 bytes or longer
2203   // because it can be patched on the fly by make_non_entrant. The stack bang
2204   // instruction fits that requirement.
2205 
2206   // Generate stack overflow check
2207   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2208 
2209   // Generate a new frame for the wrapper.
2210   __ enter();
2211   // -2 because return address is already present and so is saved rbp
2212   __ subptr(rsp, stack_size - 2*wordSize);
2213 
2214   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2215   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2216   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2217 
2218   // Frame is now completed as far as size and linkage.
2219   int frame_complete = ((intptr_t)__ pc()) - start;
2220 
2221 #ifdef ASSERT
2222   __ check_stack_alignment(rsp, "improperly aligned stack");
2223 #endif /* ASSERT */
2224 
2225 
2226   // We use r14 as the oop handle for the receiver/klass
2227   // It is callee save so it survives the call to native
2228 
2229   const Register oop_handle_reg = r14;
2230 
2231   //
2232   // We immediately shuffle the arguments so that for any VM call we have to
2233   // make from here on out (sync slow path, jvmti, etc.) we will have
2234   // captured the oops from our caller and have a valid oopMap for
2235   // them.
2236 
2237   // -----------------
2238   // The Grand Shuffle
2239 
2240   // The Java calling convention is either equal (linux) or denser (win64) than the
2241   // C calling convention. However, because of the jni_env argument the C calling
2242   // convention always has at least one more (and two for static) arguments than Java.
2243   // Therefore, if we move the args from java -> c backwards then we will never have
2244   // a register->register conflict and we don't have to build a dependency graph
2245   // and figure out how to break any cycles.
2246   //
2247 
2248   // Record esp-based slot for receiver on stack for non-static methods
2249   int receiver_offset = -1;
2250 
2251   // This is a trick. We double the stack slots so we can claim
2252   // the oops in the caller's frame. Since we are sure to have
2253   // more args than the caller, doubling is enough to make
2254   // sure we can capture all the incoming oop args from the
2255   // caller.
2256   //
2257   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2258 
2259   // Mark location of rbp (someday)
2260   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2261 
2262   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2263   // All inbound args are referenced based on rbp and all outbound args via rsp.
2264 
2265 
2266 #ifdef ASSERT
2267   bool reg_destroyed[Register::number_of_registers];
2268   bool freg_destroyed[XMMRegister::number_of_registers];
2269   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2270     reg_destroyed[r] = false;
2271   }
2272   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2273     freg_destroyed[f] = false;
2274   }
2275 
2276 #endif /* ASSERT */
2277 
2278   // For JNI natives the incoming and outgoing registers are offset upwards.
2279   GrowableArray<int> arg_order(2 * total_in_args);
2280 
2281   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2282     arg_order.push(i);
2283     arg_order.push(c_arg);
2284   }
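       // arg_order now holds (java_index, c_index) pairs, pushed starting from the last
       // argument, so the move loop below processes the arguments back to front.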
2285 
2286   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2287     int i = arg_order.at(ai);
2288     int c_arg = arg_order.at(ai + 1);
2289     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2290 #ifdef ASSERT
2291     if (in_regs[i].first()->is_Register()) {
2292       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2293     } else if (in_regs[i].first()->is_XMMRegister()) {
2294       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2295     }
2296     if (out_regs[c_arg].first()->is_Register()) {
2297       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2298     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2299       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2300     }
2301 #endif /* ASSERT */
2302     switch (in_sig_bt[i]) {
2303       case T_ARRAY:
2304       case T_OBJECT:
2305         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2306                     ((i == 0) && (!is_static)),
2307                     &receiver_offset);
2308         break;
2309       case T_VOID:
2310         break;
2311 
2312       case T_FLOAT:
2313         __ float_move(in_regs[i], out_regs[c_arg]);
2314           break;
2315 
2316       case T_DOUBLE:
2317         assert( i + 1 < total_in_args &&
2318                 in_sig_bt[i + 1] == T_VOID &&
2319                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2320         __ double_move(in_regs[i], out_regs[c_arg]);
2321         break;
2322 
2323       case T_LONG :
2324         __ long_move(in_regs[i], out_regs[c_arg]);
2325         break;
2326 
2327       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2328 
2329       default:
2330         __ move32_64(in_regs[i], out_regs[c_arg]);
2331     }
2332   }
2333 
2334   int c_arg;
2335 
2336   // Pre-load a static method's oop into r14.  Used both by locking code and
2337   // the normal JNI call code.
2338   // point c_arg at the first arg that is already loaded in case we
2339   // need to spill before we call out
2340   c_arg = total_c_args - total_in_args;
2341 
2342   if (method->is_static()) {
2343 
2344     //  load oop into a register
2345     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2346 
2347     // Now handlize the static class mirror; it's known to be non-null.
2348     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2349     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2350 
2351     // Now get the handle
2352     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2353     // store the klass handle as second argument
2354     __ movptr(c_rarg1, oop_handle_reg);
2355     // and protect the arg if we must spill
2356     c_arg--;
2357   }
2358 
2359   // Change state to native (we save the return address in the thread, since it might not
2360   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2361   // points into the right code segment. It does not have to be the correct return pc.
2362   // We use the same pc/oopMap repeatedly when we call out
2363 
2364   Label native_return;
2365   if (method->is_object_wait0()) {
2366     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2367     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2368   } else {
2369     intptr_t the_pc = (intptr_t) __ pc();
2370     oop_maps->add_gc_map(the_pc - start, map);
2371 
2372     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2373   }
2374 
2375   // We have all of the arguments set up at this point. We must not touch any of the
2376   // argument registers (if we were to save/restore them, there is no oop map to describe them).
2377 
2378   if (DTraceMethodProbes) {
2379     // protect the args we've loaded
2380     save_args(masm, total_c_args, c_arg, out_regs);
2381     __ mov_metadata(c_rarg1, method());
2382     __ call_VM_leaf(
2383       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2384       r15_thread, c_rarg1);
2385     restore_args(masm, total_c_args, c_arg, out_regs);
2386   }
2387 
2388   // RedefineClasses() tracing support for obsolete method entry
2389   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2390     // protect the args we've loaded
2391     save_args(masm, total_c_args, c_arg, out_regs);
2392     __ mov_metadata(c_rarg1, method());
2393     __ call_VM_leaf(
2394       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2395       r15_thread, c_rarg1);
2396     restore_args(masm, total_c_args, c_arg, out_regs);
2397   }
2398 
2399   // Lock a synchronized method
2400 
2401   // Register definitions used by locking and unlocking
2402 
2403   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2404   const Register obj_reg  = rbx;  // Will contain the oop
2405   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2406 
2407   Label slow_path_lock;
2408   Label lock_done;
2409 
2410   if (method->is_synchronized()) {
2411     // Get the handle (the 2nd argument)
2412     __ mov(oop_handle_reg, c_rarg1);
2413 
2414     // Get address of the box
2415 
2416     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2417 
2418     // Load the oop from the handle
2419     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2420 
2421     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2422 
2423     // Slow path will re-enter here
2424     __ bind(lock_done);
2425   }
2426 
2427   // Finally just about ready to make the JNI call
2428 
2429   // get JNIEnv* which is first argument to native
2430   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2431 
2432   // Now set thread in native
2433   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2434 
2435   __ call(RuntimeAddress(native_func));
2436 
2437   // Verify or restore cpu control state after JNI call
2438   __ restore_cpu_control_state_after_jni(rscratch1);
2439 
2440   // Unpack native results.
2441   switch (ret_type) {
2442   case T_BOOLEAN: __ c2bool(rax);            break;
2443   case T_CHAR   : __ movzwl(rax, rax);      break;
2444   case T_BYTE   : __ sign_extend_byte (rax); break;
2445   case T_SHORT  : __ sign_extend_short(rax); break;
2446   case T_INT    : /* nothing to do */        break;
2447   case T_DOUBLE :
2448   case T_FLOAT  :
2449     // Result is in xmm0 we'll save as needed
2450     break;
2451   case T_ARRAY:                 // Really a handle
2452   case T_OBJECT:                // Really a handle
2453       break; // can't de-handlize until after safepoint check
2454   case T_VOID: break;
2455   case T_LONG: break;
2456   default       : ShouldNotReachHere();
2457   }
2458 
2459   // Switch thread to "native transition" state before reading the synchronization state.
2460   // This additional state is necessary because reading and testing the synchronization
2461   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2462   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2463   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2464   //     Thread A is resumed to finish this native method, but doesn't block here since it
2465   //     didn't see any synchronization in progress, and escapes.
2466   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2467 
2468   // Force this write out before the read below
2469   if (!UseSystemMemoryBarrier) {
2470     __ membar(Assembler::Membar_mask_bits(
2471               Assembler::LoadLoad | Assembler::LoadStore |
2472               Assembler::StoreLoad | Assembler::StoreStore));
2473   }
2474 
2475   // check for safepoint operation in progress and/or pending suspend requests
2476   {
2477     Label Continue;
2478     Label slow_path;
2479 
2480     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2481 
2482     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2483     __ jcc(Assembler::equal, Continue);
2484     __ bind(slow_path);
2485 
2486     // Don't use call_VM as it will see a possible pending exception and forward it
2487     // and never return here, preventing us from clearing _last_native_pc down below.
2488     // We can't use call_VM_leaf either, as it will check whether rsi & rdi are
2489     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2490     // by hand.
2491     //
2492     __ vzeroupper();
2493     save_native_result(masm, ret_type, stack_slots);
2494     __ mov(c_rarg0, r15_thread);
2495     __ mov(r12, rsp); // remember sp
2496     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2497     __ andptr(rsp, -16); // align stack as required by ABI
2498     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2499     __ mov(rsp, r12); // restore sp
2500     __ reinit_heapbase();
2501     // Restore any method result value
2502     restore_native_result(masm, ret_type, stack_slots);
2503     __ bind(Continue);
2504   }
2505 
2506   // change thread state
2507   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2508 
2509   if (method->is_object_wait0()) {
2510     // Check preemption for Object.wait()
2511     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2512     __ cmpptr(rscratch1, NULL_WORD);
2513     __ jccb(Assembler::equal, native_return);
2514     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2515     __ jmp(rscratch1);
2516     __ bind(native_return);
2517 
2518     intptr_t the_pc = (intptr_t) __ pc();
2519     oop_maps->add_gc_map(the_pc - start, map);
2520   }
2521 
2522 
2523   Label reguard;
2524   Label reguard_done;
2525   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2526   __ jcc(Assembler::equal, reguard);
2527   __ bind(reguard_done);
2528 
2529   // The native result, if any, is live at this point.
2530 
2531   // Unlock
2532   Label slow_path_unlock;
2533   Label unlock_done;
2534   if (method->is_synchronized()) {
2535 
2536     Label fast_done;
2537 
2538     // Get locked oop from the handle we passed to jni
2539     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2540 
2541     // Must save rax if it is live now because cmpxchg must use it
2542     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2543       save_native_result(masm, ret_type, stack_slots);
2544     }
2545 
2546     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2547 
2548     // slow path re-enters here
2549     __ bind(unlock_done);
2550     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2551       restore_native_result(masm, ret_type, stack_slots);
2552     }
2553 
2554     __ bind(fast_done);
2555   }
2556   if (DTraceMethodProbes) {
2557     save_native_result(masm, ret_type, stack_slots);
2558     __ mov_metadata(c_rarg1, method());
2559     __ call_VM_leaf(
2560          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2561          r15_thread, c_rarg1);
2562     restore_native_result(masm, ret_type, stack_slots);
2563   }
2564 
2565   __ reset_last_Java_frame(false);
2566 
2567   // Unbox oop result, e.g. JNIHandles::resolve value.
2568   if (is_reference_type(ret_type)) {
2569     __ resolve_jobject(rax /* value */,
2570                        rcx /* tmp */);
2571   }
2572 
2573   if (CheckJNICalls) {
2574     // clear_pending_jni_exception_check
2575     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2576   }
2577 
2578   // reset handle block
2579   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2580   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2581 
2582   // pop our frame
2583 
2584   __ leave();
2585 
2586 #if INCLUDE_JFR
2587   // We need to do a poll test after unwind in case the sampler
2588   // managed to sample the native frame after returning to Java.
2589   Label L_return;
2590   address poll_test_pc = __ pc();
2591   __ relocate(relocInfo::poll_return_type);
2592   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2593   __ jccb(Assembler::zero, L_return);
2594   __ lea(rscratch1, InternalAddress(poll_test_pc));
2595   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2596   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2597     "polling page return stub not created yet");
2598   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2599   __ jump(RuntimeAddress(stub));
2600   __ bind(L_return);
2601 #endif // INCLUDE_JFR
2602 
2603   // Any exception pending?
2604   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2605   __ jcc(Assembler::notEqual, exception_pending);
2606 
2607   // Return
2608 
2609   __ ret(0);
2610 
2611   // Unexpected paths are out of line and go here
2612 
2613   // Forward the exception
2614   __ bind(exception_pending);
2615 
2616   // and jump to the common forwarding stub
2617   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2618 
2619   // Slow path locking & unlocking
2620   if (method->is_synchronized()) {
2621 
2622     // BEGIN Slow path lock
2623     __ bind(slow_path_lock);
2624 
2625     // last_Java_frame is already set up. No exceptions can occur, so do a vanilla call, not call_VM
2626     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2627 
2628     // protect the args we've loaded
2629     save_args(masm, total_c_args, c_arg, out_regs);
2630 
2631     __ mov(c_rarg0, obj_reg);
2632     __ mov(c_rarg1, lock_reg);
2633     __ mov(c_rarg2, r15_thread);
2634 
2635     // Not a leaf but we have last_Java_frame setup as we want.
2636     // We don't want to unmount in case of contention since that would complicate preserving
2637     // the arguments that had already been marshalled into the native convention. So we force
2638     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2639     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2640     __ push_cont_fastpath();
2641     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2642     __ pop_cont_fastpath();
2643     restore_args(masm, total_c_args, c_arg, out_regs);
2644 
2645 #ifdef ASSERT
2646     { Label L;
2647     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2648     __ jcc(Assembler::equal, L);
2649     __ stop("no pending exception allowed on exit from monitorenter");
2650     __ bind(L);
2651     }
2652 #endif
2653     __ jmp(lock_done);
2654 
2655     // END Slow path lock
2656 
2657     // BEGIN Slow path unlock
2658     __ bind(slow_path_unlock);
2659 
2660     // If we haven't already saved the native result we must save it now as xmm registers
2661     // are still exposed.
2662     __ vzeroupper();
2663     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2664       save_native_result(masm, ret_type, stack_slots);
2665     }
2666 
2667     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2668 
2669     __ mov(c_rarg0, obj_reg);
2670     __ mov(c_rarg2, r15_thread);
2671     __ mov(r12, rsp); // remember sp
2672     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2673     __ andptr(rsp, -16); // align stack as required by ABI
2674 
2675     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2676     // NOTE that obj_reg == rbx currently
2677     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2678     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2679 
2680     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2681     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2682     __ mov(rsp, r12); // restore sp
2683     __ reinit_heapbase();
2684 #ifdef ASSERT
2685     {
2686       Label L;
2687       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2688       __ jcc(Assembler::equal, L);
2689       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2690       __ bind(L);
2691     }
2692 #endif /* ASSERT */
2693 
2694     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2695 
2696     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2697       restore_native_result(masm, ret_type, stack_slots);
2698     }
2699     __ jmp(unlock_done);
2700 
2701     // END Slow path unlock
2702 
2703   } // synchronized
2704 
2705   // SLOW PATH Reguard the stack if needed
2706 
2707   __ bind(reguard);
2708   __ vzeroupper();
2709   save_native_result(masm, ret_type, stack_slots);
2710   __ mov(r12, rsp); // remember sp
2711   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2712   __ andptr(rsp, -16); // align stack as required by ABI
2713   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2714   __ mov(rsp, r12); // restore sp
2715   __ reinit_heapbase();
2716   restore_native_result(masm, ret_type, stack_slots);
2717   // and continue
2718   __ jmp(reguard_done);
2719 
2720 
2721 
2722   __ flush();
2723 
2724   nmethod *nm = nmethod::new_native_nmethod(method,
2725                                             compile_id,
2726                                             masm->code(),
2727                                             vep_offset,
2728                                             frame_complete,
2729                                             stack_slots / VMRegImpl::slots_per_word,
2730                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2731                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2732                                             oop_maps);
2733 
2734   return nm;
2735 }
2736 
2737 // This function returns the adjustment size (in number of words) to a c2i adapter
2738 // activation for use during deoptimization.
2739 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2740   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2741 }
2742 
2743 
2744 uint SharedRuntime::out_preserve_stack_slots() {
2745   return 0;
2746 }
2747 
2748 
2749 // Number of stack slots between incoming argument block and the start of
2750 // a new frame.  The PROLOG must add this many slots to the stack.  The
2751 // EPILOG must remove this many slots.  amd64 needs two slots for the
2752 // return address and two for the saved rbp.
2753 uint SharedRuntime::in_preserve_stack_slots() {
2754   return 4 + 2 * VerifyStackAtCalls;
2755 }
2756 
2757 VMReg SharedRuntime::thread_register() {
2758   return r15_thread->as_VMReg();
2759 }
2760 
2761 //------------------------------generate_deopt_blob----------------------------
2762 void SharedRuntime::generate_deopt_blob() {
2763   // Allocate space for the code
2764   ResourceMark rm;
2765   // Setup code generation tools
2766   int pad = 0;
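       // Larger register save/restore sequences are emitted when wide vector
       // state (AVX-512 ZMM and opmask registers) or the APX extended GPRs must
       // be preserved, so enlarge the code buffer accordingly.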
2767   if (UseAVX > 2) {
2768     pad += 1024;
2769   }
2770   if (UseAPX) {
2771     pad += 1024;
2772   }
2773 #if INCLUDE_JVMCI
2774   if (EnableJVMCI) {
2775     pad += 512; // Increase the buffer size when compiling for JVMCI
2776   }
2777 #endif
2778   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2779   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2780   if (blob != nullptr) {
2781     _deopt_blob = blob->as_deoptimization_blob();
2782     return;
2783   }
2784 
2785   CodeBuffer buffer(name, 2560+pad, 1024);
2786   MacroAssembler* masm = new MacroAssembler(&buffer);
2787   int frame_size_in_words;
2788   OopMap* map = nullptr;
2789   OopMapSet *oop_maps = new OopMapSet();
2790 
2791   // -------------
2792   // This code enters when returning to a de-optimized nmethod.  A return
2793   // address has been pushed on the stack, and return values are in
2794   // registers.
2795   // If we are doing a normal deopt then we were called from the patched
2796   // nmethod at the point we returned to it, so the return address on the
2797   // stack is off by NativeCall::instruction_size.  We will adjust the value
2798   // so it looks like we have the original return address on the stack
2799   // (as when we eagerly deoptimized).
2800   // In the case of an exception pending when deoptimizing, we enter
2801   // with a return address on the stack that points after the call we patched
2802   // into the exception handler. We have the following register state from,
2803   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2804   //    rax: exception oop
2805   //    rbx: exception handler
2806   //    rdx: throwing pc
2807   // So in this case we simply jam rdx into the useless return address and
2808   // the stack looks just like we want.
2809   //
2810   // At this point we need to de-opt.  We save the argument return
2811   // registers.  We call the first C routine, fetch_unroll_info().  This
2812   // routine captures the return values and returns a structure which
2813   // describes the current frame size and the sizes of all replacement frames.
2814   // The current frame is compiled code and may contain many inlined
2815   // functions, each with their own JVM state.  We pop the current frame, then
2816   // push all the new frames.  Then we call the C routine unpack_frames() to
2817   // populate these frames.  Finally unpack_frames() returns us the new target
2818   // address.  Notice that callee-save registers are BLOWN here; they have
2819   // already been captured in the vframeArray at the time the return PC was
2820   // patched.
2821   address start = __ pc();
2822   Label cont;
2823 
2824   // Prolog for non exception case!
2825 
2826   // Save everything in sight.
2827   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2828 
2829   // Normal deoptimization.  Save exec mode for unpack_frames.
2830   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2831   __ jmp(cont);
2832 
2833   int reexecute_offset = __ pc() - start;
2834 #if INCLUDE_JVMCI && !defined(COMPILER1)
2835   if (UseJVMCICompiler) {
2836     // JVMCI does not use this kind of deoptimization
2837     __ should_not_reach_here();
2838   }
2839 #endif
2840 
2841   // Reexecute case
2842   // The return address is the pc that describes which bci to re-execute at
2843 
2844   // No need to update map as each call to save_live_registers will produce identical oopmap
2845   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2846 
2847   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2848   __ jmp(cont);
2849 
2850 #if INCLUDE_JVMCI
2851   Label after_fetch_unroll_info_call;
2852   int implicit_exception_uncommon_trap_offset = 0;
2853   int uncommon_trap_offset = 0;
2854 
2855   if (EnableJVMCI) {
2856     implicit_exception_uncommon_trap_offset = __ pc() - start;
2857 
2858     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2859     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2860 
2861     uncommon_trap_offset = __ pc() - start;
2862 
2863     // Save everything in sight.
2864     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2865     // fetch_unroll_info needs to call last_java_frame()
2866     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2867 
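         // Pass the pending deoptimization request to uncommon_trap and clear
         // the field (-1 appears to mean "no deoptimization pending").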
2868     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2869     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2870 
2871     __ movl(r14, Deoptimization::Unpack_reexecute);
2872     __ mov(c_rarg0, r15_thread);
2873     __ movl(c_rarg2, r14); // exec mode
2874     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2875     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2876 
2877     __ reset_last_Java_frame(false);
2878 
2879     __ jmp(after_fetch_unroll_info_call);
2880   } // EnableJVMCI
2881 #endif // INCLUDE_JVMCI
2882 
2883   int exception_offset = __ pc() - start;
2884 
2885   // Prolog for exception case
2886 
2887   // All registers are dead at this entry point, except for rax and
2888   // rdx, which contain the exception oop and exception pc
2889   // respectively.  Set them in TLS and fall thru to the
2890   // unpack_with_exception_in_tls entry point.
2891 
2892   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2893   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2894 
2895   int exception_in_tls_offset = __ pc() - start;
2896 
2897   // new implementation because exception oop is now passed in JavaThread
2898 
2899   // Prolog for exception case
2900   // All registers must be preserved because they might be used by LinearScan
2901   // Exception oop and throwing PC are passed in JavaThread
2902   // tos: stack at point of call to method that threw the exception (i.e. only
2903   // args are on the stack, no return address)
2904 
2905   // make room on stack for the return address
2906   // It will be patched later with the throwing pc. The correct value is not
2907   // available now because loading it from memory would destroy registers.
2908   __ push(0);
2909 
2910   // Save everything in sight.
2911   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2912 
2913   // Now it is safe to overwrite any register
2914 
2915   // Deopt during an exception.  Save exec mode for unpack_frames.
2916   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2917 
2918   // load throwing pc from JavaThread and patch it as the return address
2919   // of the current frame. Then clear the field in JavaThread
2920 
2921   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2922   __ movptr(Address(rbp, wordSize), rdx);
2923   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2924 
2925 #ifdef ASSERT
2926   // verify that there is really an exception oop in JavaThread
2927   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2928   __ verify_oop(rax);
2929 
2930   // verify that there is no pending exception
2931   Label no_pending_exception;
2932   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2933   __ testptr(rax, rax);
2934   __ jcc(Assembler::zero, no_pending_exception);
2935   __ stop("must not have pending exception here");
2936   __ bind(no_pending_exception);
2937 #endif
2938 
2939   __ bind(cont);
2940 
2941   // Call C code.  Need thread and this frame, but NOT official VM entry
2942   // crud.  We cannot block on this call, no GC can happen.
2943   //
2944   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2945 
2946   // fetch_unroll_info needs to call last_java_frame().
2947 
2948   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2949 #ifdef ASSERT
2950   { Label L;
2951     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2952     __ jcc(Assembler::equal, L);
2953     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2954     __ bind(L);
2955   }
2956 #endif // ASSERT
2957   __ mov(c_rarg0, r15_thread);
2958   __ movl(c_rarg1, r14); // exec_mode
2959   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2960 
2961   // Need to have an oopmap that tells fetch_unroll_info where to
2962   // find any register it might need.
2963   oop_maps->add_gc_map(__ pc() - start, map);
2964 
2965   __ reset_last_Java_frame(false);
2966 
2967 #if INCLUDE_JVMCI
2968   if (EnableJVMCI) {
2969     __ bind(after_fetch_unroll_info_call);
2970   }
2971 #endif
2972 
2973   // Load UnrollBlock* into rdi
2974   __ mov(rdi, rax);
2975 
2976   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2977   Label noException;
2978   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2979   __ jcc(Assembler::notEqual, noException);
2980   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2981   // QQQ this is useless; it was null above
2982   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2983   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2984   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2985 
2986   __ verify_oop(rax);
2987 
2988   // Overwrite the result registers with the exception results.
2989   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2990   // I think this is useless
2991   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2992 
2993   __ bind(noException);
2994 
2995   // Only register save data is on the stack.
2996   // Now restore the result registers.  Everything else is either dead
2997   // or captured in the vframeArray.
2998   RegisterSaver::restore_result_registers(masm);
2999 
3000   // All of the register save area has been popped off the stack. Only the
3001   // return address remains.
3002 
3003   // Pop all the frames we must move/replace.
3004   //
3005   // Frame picture (youngest to oldest)
3006   // 1: self-frame (no frame link)
3007   // 2: deopting frame  (no frame link)
3008   // 3: caller of deopting frame (could be compiled/interpreted).
3009   //
3010   // Note: by leaving the return address of the self-frame on the stack
3011   // and using the size of frame 2 to adjust the stack,
3012   // the return address to frame 3 will still be on the stack when we are done.
3013 
3014   // Pop deoptimized frame
3015   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3016   __ addptr(rsp, rcx);
3017 
3018   // rsp should be pointing at the return address to the caller (3)
3019 
3020   // Pick up the initial fp we should save
3021   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3022   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3023 
3024 #ifdef ASSERT
3025   // Compilers generate code that bang the stack by as much as the
3026   // interpreter would need. So this stack banging should never
3027   // trigger a fault. Verify that it does not on non product builds.
3028   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3029   __ bang_stack_size(rbx, rcx);
3030 #endif
3031 
3032   // Load address of array of frame pcs into rcx
3033   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3034 
3035   // Trash the old pc
3036   __ addptr(rsp, wordSize);
3037 
3038   // Load address of array of frame sizes into rsi
3039   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3040 
3041   // Load counter into rdx
3042   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3043 
3044   // Now adjust the caller's stack to make up for the extra locals,
3045   // but record the original sp first so that we can save it in the skeletal
3046   // interpreter frame; the stack walking of interpreter_sender will then get the
3047   // unextended sp value and not the "real" sp value.
3048 
3049   const Register sender_sp = r8;
3050 
3051   __ mov(sender_sp, rsp);
3052   __ movl(rbx, Address(rdi,
3053                        Deoptimization::UnrollBlock::
3054                        caller_adjustment_offset()));
3055   __ subptr(rsp, rbx);
3056 
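       // Each iteration below builds one skeletal interpreter frame:
       //  - push the frame's return pc (from the frame_pcs array),
       //  - enter() pushes the old rbp and establishes the new one,
       //  - drop rsp by (frame size - 2 words) for the frame body,
       //  - clear last_sp and record sender_sp to make the frame walkable.
       // The frames are filled in later by unpack_frames().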
3057   // Push interpreter frames in a loop
3058   Label loop;
3059   __ bind(loop);
3060   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3061   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3062   __ pushptr(Address(rcx, 0));          // Save return address
3063   __ enter();                           // Save old & set new ebp
3064   __ subptr(rsp, rbx);                  // Prolog
3065   // This value is corrected by layout_activation_impl
3066   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3067   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3068   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3069   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3070   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3071   __ decrementl(rdx);                   // Decrement counter
3072   __ jcc(Assembler::notZero, loop);
3073   __ pushptr(Address(rcx, 0));          // Save final return address
3074 
3075   // Re-push self-frame
3076   __ enter();                           // Save old & set new ebp
3077 
3078   // Allocate a full-sized register save area.
3079   // Return address and rbp are in place, so we allocate two fewer words.
3080   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3081 
3082   // Restore frame locals after moving the frame
3083   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3084   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3085 
3086   // Call C code.  Need thread but NOT official VM entry
3087   // crud.  We cannot block on this call, no GC can happen.  Call should
3088   // restore return values to their stack-slots with the new SP.
3089   //
3090   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3091 
3092   // Use rbp because the frames look interpreted now
3093   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3094   // Don't need the precise return PC here, just precise enough to point into this code blob.
3095   address the_pc = __ pc();
3096   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3097 
3098   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3099   __ mov(c_rarg0, r15_thread);
3100   __ movl(c_rarg1, r14); // second arg: exec_mode
3101   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3102   // Revert SP alignment after call since we're going to do some SP relative addressing below
3103   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3104 
3105   // Set an oopmap for the call site
3106   // Use the same PC we used for the last java frame
3107   oop_maps->add_gc_map(the_pc - start,
3108                        new OopMap( frame_size_in_words, 0 ));
3109 
3110   // Clear fp AND pc
3111   __ reset_last_Java_frame(true);
3112 
3113   // Collect return values
3114   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3115   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3116   // I think this is useless (throwing pc?)
3117   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3118 
3119   // Pop self-frame.
3120   __ leave();                           // Epilog
3121 
3122   // Jump to interpreter
3123   __ ret(0);
3124 
3125   // Make sure all code is generated
3126   masm->flush();
3127 
3128   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3129   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3130 #if INCLUDE_JVMCI
3131   if (EnableJVMCI) {
3132     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3133     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3134   }
3135 #endif
3136 
3137   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3138 }
3139 
3140 //------------------------------generate_handler_blob------
3141 //
3142 // Generate a special Compile2Runtime blob that saves all registers
3143 // and sets up an oopmap.
3144 //
3145 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
3146   assert(StubRoutines::forward_exception_entry() != nullptr,
3147          "must be generated before");
3148   assert(is_polling_page_id(id), "expected a polling page stub id");
3149 
3150   // Allocate space for the code.  Setup code generation tools.
3151   const char* name = SharedRuntime::stub_name(id);
3152   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3153   if (blob != nullptr) {
3154     return blob->as_safepoint_blob();
3155   }
3156 
3157   ResourceMark rm;
3158   OopMapSet *oop_maps = new OopMapSet();
3159   OopMap* map;
3160   CodeBuffer buffer(name, 2548, 1024);
3161   MacroAssembler* masm = new MacroAssembler(&buffer);
3162 
3163   address start   = __ pc();
3164   address call_pc = nullptr;
3165   int frame_size_in_words;
3166   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
3167   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
3168 
3169   // Make room for return address (or push it again)
3170   if (!cause_return) {
3171     __ push(rbx);
3172   }
3173 
3174   // Save registers, fpu state, and flags
3175   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3176 
3177   // The following is basically a call_VM.  However, we need the precise
3178   // address of the call in order to generate an oopmap. Hence, we do all the
3179   // work ourselves.
3180 
3181   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3182 
3183   // The return address must always be correct so that the frame constructor
3184   // never sees an invalid pc.
3185 
3186   if (!cause_return) {
3187     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3188     // Additionally, rbx is a callee saved register and we can look at it later to determine
3189     // if someone changed the return address for us!
3190     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3191     __ movptr(Address(rbp, wordSize), rbx);
3192   }
3193 
3194   // Do the call
3195   __ mov(c_rarg0, r15_thread);
3196   __ call(RuntimeAddress(call_ptr));
3197 
3198   // Set an oopmap for the call site.  This oopmap will map all
3199   // oop-registers and debug-info registers as callee-saved.  This
3200   // will allow deoptimization at this safepoint to find all possible
3201   // debug-info recordings, as well as let GC find all oops.
3202 
3203   oop_maps->add_gc_map( __ pc() - start, map);
3204 
3205   Label noException;
3206 
3207   __ reset_last_Java_frame(false);
3208 
3209   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3210   __ jcc(Assembler::equal, noException);
3211 
3212   // Exception pending
3213 
3214   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3215 
3216   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3217 
3218   // No exception case
3219   __ bind(noException);
3220 
3221   Label no_adjust;
3222 #ifdef ASSERT
3223   Label bail;
3224 #endif
3225   if (!cause_return) {
3226     Label no_prefix, not_special, check_rex_prefix;
3227 
3228     // If our stashed return pc was modified by the runtime we avoid touching it
3229     __ cmpptr(rbx, Address(rbp, wordSize));
3230     __ jcc(Assembler::notEqual, no_adjust);
3231 
3232     // Skip over the poll instruction.
3233     // See NativeInstruction::is_safepoint_poll()
3234     // Possible encodings:
3235     //      85 00       test   %eax,(%rax)
3236     //      85 01       test   %eax,(%rcx)
3237     //      85 02       test   %eax,(%rdx)
3238     //      85 03       test   %eax,(%rbx)
3239     //      85 06       test   %eax,(%rsi)
3240     //      85 07       test   %eax,(%rdi)
3241     //
3242     //   41 85 00       test   %eax,(%r8)
3243     //   41 85 01       test   %eax,(%r9)
3244     //   41 85 02       test   %eax,(%r10)
3245     //   41 85 03       test   %eax,(%r11)
3246     //   41 85 06       test   %eax,(%r14)
3247     //   41 85 07       test   %eax,(%r15)
3248     //
3249     //      85 04 24    test   %eax,(%rsp)
3250     //   41 85 04 24    test   %eax,(%r12)
3251     //      85 45 00    test   %eax,0x0(%rbp)
3252     //   41 85 45 00    test   %eax,0x0(%r13)
3253     //
3254     // Notes:
3255     //  Format of the legacy MAP0 test instruction:
3256     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3257     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3258     //     register operand and of the base register of the memory operand lie in [0-8), so no
3259     //     additional REX prefix (whose REX.B bit stores the MSB of the register encoding) is
3260     //     required, which is why a two-byte encoding is sufficient here.
3261     //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3262     //     register of the memory operand is 1000, so we need an additional REX prefix in this
3263     //     case, thereby adding an additional byte to the instruction encoding.
3264     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
3265     //     supporting the Intel APX extension, we need to emit a two-byte REX2 prefix to hold
3266     //     the most significant two bits of the 5-bit register encoding.
3267 
3268     if (VM_Version::supports_apx_f()) {
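           // A REX2 prefix occupies two bytes (the REX2 byte plus its payload),
           // so skip both before checking for a legacy REX prefix below.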
3269       __ cmpb(Address(rbx, 0), Assembler::REX2);
3270       __ jccb(Assembler::notEqual, check_rex_prefix);
3271       __ addptr(rbx, 2);
3272       __ bind(check_rex_prefix);
3273     }
3274     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3275     __ jccb(Assembler::notEqual, no_prefix);
3276     __ addptr(rbx, 1);
3277     __ bind(no_prefix);
3278 #ifdef ASSERT
3279     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3280 #endif
3281     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3282     // r12/rsp 0x04
3283     // r13/rbp 0x05
3284     __ movzbq(rcx, Address(rbx, 1));
3285     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3286     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3287     __ cmpptr(rcx, 1);
3288     __ jccb(Assembler::above, not_special);
3289     __ addptr(rbx, 1);
3290     __ bind(not_special);
3291 #ifdef ASSERT
3292     // Verify the correct encoding of the poll we're about to skip.
3293     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3294     __ jcc(Assembler::notEqual, bail);
3295     // Mask out the modrm bits
3296     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3297     // rax encodes to 0, so if the bits are nonzero it's incorrect
3298     __ jcc(Assembler::notZero, bail);
3299 #endif
3300     // Adjust return pc forward to step over the safepoint poll instruction
3301     __ addptr(rbx, 2);
3302     __ movptr(Address(rbp, wordSize), rbx);
3303   }
3304 
3305   __ bind(no_adjust);
3306   // Normal exit, restore registers and exit.
3307   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3308   __ ret(0);
3309 
3310 #ifdef ASSERT
3311   __ bind(bail);
3312   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3313 #endif
3314 
3315   // Make sure all code is generated
3316   masm->flush();
3317 
3318   // Fill-out other meta info
3319   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3320 
3321   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3322   return sp_blob;
3323 }
3324 
3325 //
3326 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3327 //
3328 // Generate a stub that calls into the VM to find out the proper destination
3329 // of a Java call. All the argument registers are live at this point,
3330 // but since this is generic code we don't know what they are and the caller
3331 // must do any GC of the args.
3332 //
3333 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3334   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3335   assert(is_resolve_id(id), "expected a resolve stub id");
3336 
3337   const char* name = SharedRuntime::stub_name(id);
3338   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3339   if (blob != nullptr) {
3340     return blob->as_runtime_stub();
3341   }
3342 
3343   // allocate space for the code
3344   ResourceMark rm;
3345   CodeBuffer buffer(name, 1552, 512);
3346   MacroAssembler* masm = new MacroAssembler(&buffer);
3347 
3348   int frame_size_in_words;
3349 
3350   OopMapSet *oop_maps = new OopMapSet();
3351   OopMap* map = nullptr;
3352 
3353   int start = __ offset();
3354 
3355   // No need to save vector registers since they are caller-saved anyway.
3356   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3357 
3358   int frame_complete = __ offset();
3359 
3360   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3361 
3362   __ mov(c_rarg0, r15_thread);
3363 
3364   __ call(RuntimeAddress(destination));
3365 
3366 
3367   // Set an oopmap for the call site.
3368   // We need this not only for callee-saved registers, but also for volatile
3369   // registers that the compiler might be keeping live across a safepoint.
3370 
3371   oop_maps->add_gc_map( __ offset() - start, map);
3372 
3373   // rax contains the address we are going to jump to assuming no exception got installed
3374 
3375   // clear last_Java_sp
3376   __ reset_last_Java_frame(false);
3377   // check for pending exceptions
3378   Label pending;
3379   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3380   __ jcc(Assembler::notEqual, pending);
3381 
3382   // get the returned Method*
3383   __ get_vm_result_metadata(rbx);
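       // Stash the Method* and the destination address in their register save
       // slots so that restore_live_registers() below reloads them into rbx and
       // rax before we jump to the target.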
3384   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3385 
3386   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3387 
3388   RegisterSaver::restore_live_registers(masm);
3389 
3390   // We are back to the original state on entry and ready to go.
3391 
3392   __ jmp(rax);
3393 
3394   // Pending exception after the safepoint
3395 
3396   __ bind(pending);
3397 
3398   RegisterSaver::restore_live_registers(masm);
3399 
3400   // exception pending => remove activation and forward to exception handler
3401 
3402   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3403 
3404   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3405   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3406 
3407   // -------------
3408   // make sure all code is generated
3409   masm->flush();
3410 
3411   // return the blob
3412   // frame_size_words or bytes??
3413   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3414 
3415   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3416   return rs_blob;
3417 }
3418 
3419 // Continuation point for throwing of implicit exceptions that are
3420 // not handled in the current activation. Fabricates an exception
3421 // oop and initiates normal exception dispatching in this
3422 // frame. Since we need to preserve callee-saved values (currently
3423 // only for C2, but done for C1 as well) we need a callee-saved oop
3424 // map and therefore have to make these stubs into RuntimeStubs
3425 // rather than BufferBlobs.  If the compiler needs all registers to
3426 // be preserved between the fault point and the exception handler
3427 // then it must assume responsibility for that in
3428 // AbstractCompiler::continuation_for_implicit_null_exception or
3429 // continuation_for_implicit_division_by_zero_exception. All other
3430 // implicit exceptions (e.g., NullPointerException or
3431 // AbstractMethodError on entry) are either at call sites or
3432 // otherwise assume that stack unwinding will be initiated, so
3433 // caller saved registers were assumed volatile in the compiler.
3434 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3435   assert(is_throw_id(id), "expected a throw stub id");
3436 
3437   const char* name = SharedRuntime::stub_name(id);
3438 
3439   // Information about frame layout at time of blocking runtime call.
3440   // Note that we only have to preserve callee-saved registers since
3441   // the compilers are responsible for supplying a continuation point
3442   // if they expect all registers to be preserved.
3443   enum layout {
3444     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3445     rbp_off2,
3446     return_off,
3447     return_off2,
3448     framesize // inclusive of return address
3449   };
3450 
3451   int insts_size = 512;
3452   int locs_size  = 64;
3453 
3454   const char* timer_msg = "SharedRuntime generate_throw_exception";
3455   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3456 
3457   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3458   if (blob != nullptr) {
3459     return blob->as_runtime_stub();
3460   }
3461 
3462   ResourceMark rm;
3463   CodeBuffer code(name, insts_size, locs_size);
3464   OopMapSet* oop_maps  = new OopMapSet();
3465   MacroAssembler* masm = new MacroAssembler(&code);
3466 
3467   address start = __ pc();
3468 
3469   // This is an inlined and slightly modified version of call_VM
3470   // which has the ability to fetch the return PC out of
3471   // thread-local storage and also sets up last_Java_sp slightly
3472   // differently than the real call_VM
3473 
3474   __ enter(); // required for proper stackwalking of RuntimeStub frame
3475 
3476   assert(is_even(framesize/2), "sp not 16-byte aligned");
3477 
3478   // return address and rbp are already in place
3479   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3480 
3481   int frame_complete = __ pc() - start;
3482 
3483   // Set up last_Java_sp and last_Java_fp
3484   address the_pc = __ pc();
3485   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3486   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3487 
3488   // Call runtime
3489   __ movptr(c_rarg0, r15_thread);
3490   BLOCK_COMMENT("call runtime_entry");
3491   __ call(RuntimeAddress(runtime_entry));
3492 
3493   // Generate oop map
3494   OopMap* map = new OopMap(framesize, 0);
3495 
3496   oop_maps->add_gc_map(the_pc - start, map);
3497 
3498   __ reset_last_Java_frame(true);
3499 
3500   __ leave(); // required for proper stackwalking of RuntimeStub frame
3501 
3502   // check for pending exceptions
3503 #ifdef ASSERT
3504   Label L;
3505   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3506   __ jcc(Assembler::notEqual, L);
3507   __ should_not_reach_here();
3508   __ bind(L);
3509 #endif // ASSERT
3510   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3511 
3512 
3513   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3514   RuntimeStub* stub =
3515     RuntimeStub::new_runtime_stub(name,
3516                                   &code,
3517                                   frame_complete,
3518                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3519                                   oop_maps, false);
3520   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3521 
3522   return stub;
3523 }
3524 
3525 //------------------------------Montgomery multiplication------------------------
3526 //
3527 
3528 #ifndef _WINDOWS
3529 
3530 // Subtract 0:b from carry:a.  Return carry.
3531 static julong
3532 sub(julong a[], julong b[], julong carry, long len) {
3533   long long i = 0, cnt = len;
3534   julong tmp;
3535   asm volatile("clc; "
3536                "0: ; "
3537                "mov (%[b], %[i], 8), %[tmp]; "
3538                "sbb %[tmp], (%[a], %[i], 8); "
3539                "inc %[i]; dec %[cnt]; "
3540                "jne 0b; "
3541                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3542                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3543                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3544                : "memory");
3545   return tmp;
3546 }
3547 
3548 // Multiply (unsigned) Long A by Long B, accumulating the double-
3549 // length result into the accumulator formed of T0, T1, and T2.
3550 #define MACC(A, B, T0, T1, T2)                                  \
3551 do {                                                            \
3552   unsigned long hi, lo;                                         \
3553   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3554            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3555            : "r"(A), "a"(B) : "cc");                            \
3556  } while(0)
3557 
3558 // As above, but add twice the double-length result into the
3559 // accumulator.
3560 #define MACC2(A, B, T0, T1, T2)                                 \
3561 do {                                                            \
3562   unsigned long hi, lo;                                         \
3563   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3564            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3565            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3566            : "r"(A), "a"(B) : "cc");                            \
3567  } while(0)
3568 
3569 #else //_WINDOWS
3570 
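     // Subtract 0:b from carry:a.  Return carry.  Implemented with _addcarry_u64
     // on the one's complement of b: seeding the carry chain with 1 makes
     // a + ~b + 1 == a - b.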
3571 static julong
3572 sub(julong a[], julong b[], julong carry, long len) {
3573   long i;
3574   julong tmp;
3575   unsigned char c = 1;
3576   for (i = 0; i < len; i++) {
3577     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3578     a[i] = tmp;
3579   }
3580   c = _addcarry_u64(c, carry, ~0, &tmp);
3581   return tmp;
3582 }
3583 
3584 // Multiply (unsigned) Long A by Long B, accumulating the double-
3585 // length result into the accumulator formed of T0, T1, and T2.
3586 #define MACC(A, B, T0, T1, T2)                          \
3587 do {                                                    \
3588   julong hi, lo;                            \
3589   lo = _umul128(A, B, &hi);                             \
3590   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3591   c = _addcarry_u64(c, hi, T1, &T1);                    \
3592   _addcarry_u64(c, T2, 0, &T2);                         \
3593  } while(0)
3594 
3595 // As above, but add twice the double-length result into the
3596 // accumulator.
3597 #define MACC2(A, B, T0, T1, T2)                         \
3598 do {                                                    \
3599   julong hi, lo;                            \
3600   lo = _umul128(A, B, &hi);                             \
3601   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3602   c = _addcarry_u64(c, hi, T1, &T1);                    \
3603   _addcarry_u64(c, T2, 0, &T2);                         \
3604   c = _addcarry_u64(0, lo, T0, &T0);                    \
3605   c = _addcarry_u64(c, hi, T1, &T1);                    \
3606   _addcarry_u64(c, T2, 0, &T2);                         \
3607  } while(0)
3608 
3609 #endif //_WINDOWS
3610 
3611 // Fast Montgomery multiplication.  The derivation of the algorithm is
3612 // in A Cryptographic Library for the Motorola DSP56000,
3613 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
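     //
     // Roughly: with R = 2^(64*len), the loops accumulate the columns of a*b + m*n
     // in the triple-precision accumulator (t2:t1:t0).  In each of the first len
     // columns, m[i] is chosen as t0 * inv (mod 2^64), where inv == -n[0]^-1
     // (mod 2^64), so that adding m[i]*n[0] clears the low 64 bits and the column
     // can be shifted out.  After all 2*len columns, m holds a*b*R^-1 (mod n); the
     // trailing while-loop over sub() subtracts n as needed to bring the result
     // into range.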
3614 
3615 static void NOINLINE
3616 montgomery_multiply(julong a[], julong b[], julong n[],
3617                     julong m[], julong inv, int len) {
3618   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3619   int i;
3620 
3621   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3622 
3623   for (i = 0; i < len; i++) {
3624     int j;
3625     for (j = 0; j < i; j++) {
3626       MACC(a[j], b[i-j], t0, t1, t2);
3627       MACC(m[j], n[i-j], t0, t1, t2);
3628     }
3629     MACC(a[i], b[0], t0, t1, t2);
3630     m[i] = t0 * inv;
3631     MACC(m[i], n[0], t0, t1, t2);
3632 
3633     assert(t0 == 0, "broken Montgomery multiply");
3634 
3635     t0 = t1; t1 = t2; t2 = 0;
3636   }
3637 
3638   for (i = len; i < 2*len; i++) {
3639     int j;
3640     for (j = i-len+1; j < len; j++) {
3641       MACC(a[j], b[i-j], t0, t1, t2);
3642       MACC(m[j], n[i-j], t0, t1, t2);
3643     }
3644     m[i-len] = t0;
3645     t0 = t1; t1 = t2; t2 = 0;
3646   }
3647 
3648   while (t0)
3649     t0 = sub(m, n, t0, len);
3650 }
3651 
3652 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3653 // multiplies so it should be up to 25% faster than Montgomery
3654 // multiplication.  However, its loop control is more complex and it
3655 // may actually run slower on some machines.
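     //
     // Each column i of a*a sums a[j]*a[i-j]: the cross terms (j != i-j) are
     // accumulated twice via MACC2, and the diagonal term a[i/2]^2 is added once
     // when i is even.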
3656 
3657 static void NOINLINE
3658 montgomery_square(julong a[], julong n[],
3659                   julong m[], julong inv, int len) {
3660   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3661   int i;
3662 
3663   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3664 
3665   for (i = 0; i < len; i++) {
3666     int j;
3667     int end = (i+1)/2;
3668     for (j = 0; j < end; j++) {
3669       MACC2(a[j], a[i-j], t0, t1, t2);
3670       MACC(m[j], n[i-j], t0, t1, t2);
3671     }
3672     if ((i & 1) == 0) {
3673       MACC(a[j], a[j], t0, t1, t2);
3674     }
3675     for (; j < i; j++) {
3676       MACC(m[j], n[i-j], t0, t1, t2);
3677     }
3678     m[i] = t0 * inv;
3679     MACC(m[i], n[0], t0, t1, t2);
3680 
3681     assert(t0 == 0, "broken Montgomery square");
3682 
3683     t0 = t1; t1 = t2; t2 = 0;
3684   }
3685 
3686   for (i = len; i < 2*len; i++) {
3687     int start = i-len+1;
3688     int end = start + (len - start)/2;
3689     int j;
3690     for (j = start; j < end; j++) {
3691       MACC2(a[j], a[i-j], t0, t1, t2);
3692       MACC(m[j], n[i-j], t0, t1, t2);
3693     }
3694     if ((i & 1) == 0) {
3695       MACC(a[j], a[j], t0, t1, t2);
3696     }
3697     for (; j < len; j++) {
3698       MACC(m[j], n[i-j], t0, t1, t2);
3699     }
3700     m[i-len] = t0;
3701     t0 = t1; t1 = t2; t2 = 0;
3702   }
3703 
3704   while (t0)
3705     t0 = sub(m, n, t0, len);
3706 }
3707 
3708 // Swap words in a longword.
3709 static julong swap(julong x) {
3710   return (x << 32) | (x >> 32);
3711 }
3712 
3713 // Copy len longwords from s to d, word-swapping as we go.  The
3714 // destination array is reversed.
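     // The jint arrays coming from BigInteger are big-endian, while the Montgomery
     // routines above work on little-endian arrays of 64-bit words; reversing and
     // word-swapping converts between the two representations.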
3715 static void reverse_words(julong *s, julong *d, int len) {
3716   d += len;
3717   while(len-- > 0) {
3718     d--;
3719     *d = swap(*s);
3720     s++;
3721   }
3722 }
3723 
3724 // The threshold at which squaring is advantageous was determined
3725 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3726 #define MONTGOMERY_SQUARING_THRESHOLD 64
3727 
3728 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3729                                         jint len, jlong inv,
3730                                         jint *m_ints) {
3731   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3732   int longwords = len/2;
3733 
3734   // Make very sure we don't use so much space that the stack might
3735   // overflow.  512 jints corresponds to a 16384-bit integer and
3736   // will use a total of 8K bytes of stack space here.
3737   int divisor = sizeof(julong) * 4;
3738   guarantee(longwords <= 8192 / divisor, "must be");
3739   int total_allocation = longwords * sizeof (julong) * 4;
3740   julong *scratch = (julong *)alloca(total_allocation);
3741 
3742   // Local scratch arrays
3743   julong
3744     *a = scratch + 0 * longwords,
3745     *b = scratch + 1 * longwords,
3746     *n = scratch + 2 * longwords,
3747     *m = scratch + 3 * longwords;
3748 
3749   reverse_words((julong *)a_ints, a, longwords);
3750   reverse_words((julong *)b_ints, b, longwords);
3751   reverse_words((julong *)n_ints, n, longwords);
3752 
3753   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3754 
3755   reverse_words(m, (julong *)m_ints, longwords);
3756 }
3757 
3758 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3759                                       jint len, jlong inv,
3760                                       jint *m_ints) {
3761   assert(len % 2 == 0, "array length in montgomery_square must be even");
3762   int longwords = len/2;
3763 
3764   // Make very sure we don't use so much space that the stack might
3765   // overflow.  512 jints corresponds to a 16384-bit integer and
3766   // will use a total of 6K bytes of stack space here.
3767   int divisor = sizeof(julong) * 3;
3768   guarantee(longwords <= (8192 / divisor), "must be");
3769   int total_allocation = longwords * sizeof (julong) * 3;
3770   julong *scratch = (julong *)alloca(total_allocation);
3771 
3772   // Local scratch arrays
3773   julong
3774     *a = scratch + 0 * longwords,
3775     *n = scratch + 1 * longwords,
3776     *m = scratch + 2 * longwords;
3777 
3778   reverse_words((julong *)a_ints, a, longwords);
3779   reverse_words((julong *)n_ints, n, longwords);
3780 
3781   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3782     ::montgomery_square(a, n, m, (julong)inv, longwords);
3783   } else {
3784     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3785   }
3786 
3787   reverse_words(m, (julong *)m_ints, longwords);
3788 }
3789 
3790 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3791   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3792   if (buf == nullptr) {
3793     return nullptr;
3794   }
3795   CodeBuffer buffer(buf);
3796   short buffer_locs[20];
3797   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3798                                          sizeof(buffer_locs)/sizeof(relocInfo));
3799 
3800   MacroAssembler* masm = new MacroAssembler(&buffer);
3801 
3802   const Array<SigEntry>* sig_vk = vk->extended_sig();
3803   const Array<VMRegPair>* regs = vk->return_regs();
3804 
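       // The blob exposes three entry points:
       //  - pack_fields_jobject: resolve the pre-allocated buffer from its JNI
       //    handle, then fall through to pack_fields,
       //  - pack_fields: store the scalarized field values from their return
       //    registers into the buffered value at rax,
       //  - unpack_fields: load the fields of the value at rax back into the
       //    return registers (or zero the registers for a null value).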
3805   int pack_fields_jobject_off = __ offset();
3806   // Resolve pre-allocated buffer from JNI handle.
3807   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3808   __ movptr(rax, Address(r13, 0));
3809   __ resolve_jobject(rax /* value */,
3810                      r12 /* tmp */);
3811   __ movptr(Address(r13, 0), rax);
3812 
3813   int pack_fields_off = __ offset();
3814 
3815   int j = 1;
3816   for (int i = 0; i < sig_vk->length(); i++) {
3817     BasicType bt = sig_vk->at(i)._bt;
3818     if (bt == T_METADATA) {
3819       continue;
3820     }
3821     if (bt == T_VOID) {
3822       if (sig_vk->at(i-1)._bt == T_LONG ||
3823           sig_vk->at(i-1)._bt == T_DOUBLE) {
3824         j++;
3825       }
3826       continue;
3827     }
3828     int off = sig_vk->at(i)._offset;
3829     assert(off > 0, "offset in object should be positive");
3830     VMRegPair pair = regs->at(j);
3831     VMReg r_1 = pair.first();
3832     Address to(rax, off);
3833     if (bt == T_FLOAT) {
3834       __ movflt(to, r_1->as_XMMRegister());
3835     } else if (bt == T_DOUBLE) {
3836       __ movdbl(to, r_1->as_XMMRegister());
3837     } else {
3838       Register val = r_1->as_Register();
3839       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3840       if (is_reference_type(bt)) {
3841         // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
3842         __ mov(rbx, rax);
3843         Address to_with_rbx(rbx, off);
3844         __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3845       } else {
3846         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3847       }
3848     }
3849     j++;
3850   }
3851   assert(j == regs->length(), "missed a field?");
3852   if (vk->supports_nullable_layouts()) {
3853     // Set the null marker
3854     __ movb(Address(rax, vk->null_marker_offset()), 1);
3855   }
3856   __ ret(0);
3857 
3858   int unpack_fields_off = __ offset();
3859 
3860   Label skip;
3861   Label not_null;
3862   __ testptr(rax, rax);
3863   __ jcc(Assembler::notZero, not_null);
3864 
3865   // Return value is null. Zero all registers because the runtime requires a canonical
3866   // representation of a flat null.
3867   j = 1;
3868   for (int i = 0; i < sig_vk->length(); i++) {
3869     BasicType bt = sig_vk->at(i)._bt;
3870     if (bt == T_METADATA) {
3871       continue;
3872     }
3873     if (bt == T_VOID) {
3874       if (sig_vk->at(i-1)._bt == T_LONG ||
3875           sig_vk->at(i-1)._bt == T_DOUBLE) {
3876         j++;
3877       }
3878       continue;
3879     }
3880 
3881     VMRegPair pair = regs->at(j);
3882     VMReg r_1 = pair.first();
3883     if (r_1->is_XMMRegister()) {
3884       __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
3885     } else {
3886       __ xorl(r_1->as_Register(), r_1->as_Register());
3887     }
3888     j++;
3889   }
3890   __ jmp(skip);
3891   __ bind(not_null);
3892 
3893   j = 1;
3894   for (int i = 0; i < sig_vk->length(); i++) {
3895     BasicType bt = sig_vk->at(i)._bt;
3896     if (bt == T_METADATA) {
3897       continue;
3898     }
3899     if (bt == T_VOID) {
3900       if (sig_vk->at(i-1)._bt == T_LONG ||
3901           sig_vk->at(i-1)._bt == T_DOUBLE) {
3902         j++;
3903       }
3904       continue;
3905     }
3906     int off = sig_vk->at(i)._offset;
3907     assert(off > 0, "offset in object should be positive");
3908     VMRegPair pair = regs->at(j);
3909     VMReg r_1 = pair.first();
3910     VMReg r_2 = pair.second();
3911     Address from(rax, off);
3912     if (bt == T_FLOAT) {
3913       __ movflt(r_1->as_XMMRegister(), from);
3914     } else if (bt == T_DOUBLE) {
3915       __ movdbl(r_1->as_XMMRegister(), from);
3916     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3917       assert_different_registers(rax, r_1->as_Register());
3918       __ load_heap_oop(r_1->as_Register(), from);
3919     } else {
3920       assert(is_java_primitive(bt), "unexpected basic type");
3921       assert_different_registers(rax, r_1->as_Register());
3922       size_t size_in_bytes = type2aelembytes(bt);
3923       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3924     }
3925     j++;
3926   }
3927   assert(j == regs->length(), "missed a field?");
3928 
3929   __ bind(skip);
3930   __ ret(0);
3931 
3932   __ flush();
3933 
3934   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3935 }
3936 
3937 #if INCLUDE_JFR
3938 
3939 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
3940 // It returns a jobject handle to the event writer.
3941 // The handle is dereferenced and the return value is the event writer oop.
3942 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3943   enum layout {
3944     rbp_off,
3945     rbpH_off,
3946     return_off,
3947     return_off2,
3948     framesize // inclusive of return address
3949   };
3950 
3951   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3952   CodeBuffer code(name, 1024, 64);
3953   MacroAssembler* masm = new MacroAssembler(&code);
3954   address start = __ pc();
3955 
3956   __ enter();
3957   address the_pc = __ pc();
3958 
3959   int frame_complete = the_pc - start;
3960 
3961   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3962   __ movptr(c_rarg0, r15_thread);
3963   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3964   __ reset_last_Java_frame(true);
3965 
3966   // rax is jobject handle result, unpack and process it through a barrier.
3967   __ resolve_global_jobject(rax, c_rarg0);
3968 
3969   __ leave();
3970   __ ret(0);
3971 
3972   OopMapSet* oop_maps = new OopMapSet();
3973   OopMap* map = new OopMap(framesize, 1);
3974   oop_maps->add_gc_map(frame_complete, map);
3975 
3976   RuntimeStub* stub =
3977     RuntimeStub::new_runtime_stub(name,
3978                                   &code,
3979                                   frame_complete,
3980                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3981                                   oop_maps,
3982                                   false);
3983   return stub;
3984 }
3985 
3986 // For c2: call to return a leased buffer.
3987 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3988   enum layout {
3989     rbp_off,
3990     rbpH_off,
3991     return_off,
3992     return_off2,
3993     framesize // inclusive of return address
3994   };
3995 
3996   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3997   CodeBuffer code(name, 1024, 64);
3998   MacroAssembler* masm = new MacroAssembler(&code);
3999   address start = __ pc();
4000 
4001   __ enter();
4002   address the_pc = __ pc();
4003 
4004   int frame_complete = the_pc - start;
4005 
4006   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4007   __ movptr(c_rarg0, r15_thread);
4008   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4009   __ reset_last_Java_frame(true);
4010 
4011   __ leave();
4012   __ ret(0);
4013 
4014   OopMapSet* oop_maps = new OopMapSet();
4015   OopMap* map = new OopMap(framesize, 1);
4016   oop_maps->add_gc_map(frame_complete, map);
4017 
4018   RuntimeStub* stub =
4019     RuntimeStub::new_runtime_stub(name,
4020                                   &code,
4021                                   frame_complete,
4022                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4023                                   oop_maps,
4024                                   false);
4025   return stub;
4026 }
4027 
4028 #endif // INCLUDE_JFR