1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/aotCodeCache.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif // PRODUCT
  80 
  81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
  83 class RegisterSaver {
  84   // Capture info about frame layout.  Layout offsets are in jint
  85   // units because compiler frame slots are jints.
  86 #define XSAVE_AREA_BEGIN 160
  87 #define XSAVE_AREA_YMM_BEGIN 576
  88 #define XSAVE_AREA_EGPRS 960
  89 #define XSAVE_AREA_OPMASK_BEGIN 1088
  90 #define XSAVE_AREA_ZMM_BEGIN 1152
  91 #define XSAVE_AREA_UPPERBANK 1664
  92 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  93 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  94 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  95 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
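  // The XSAVE_AREA_* constants above are byte offsets from the base of the
  // FPU/vector save area reserved by push_FPU_state(): save_live_registers()
  // stores the YMM/ZMM upper halves, the opmask registers and the APX extended
  // GPRs at these offsets. The DEF_*_OFFS macros translate a register's byte
  // offset into jint-sized compiler stack slots, defining an _off/H_off
  // enumerator pair per register.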
  97   enum layout {
  98     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  99     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 100     DEF_XMM_OFFS(0),
 101     DEF_XMM_OFFS(1),
 102     // 2..15 are implied in range usage
 103     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 104     DEF_YMM_OFFS(0),
 105     DEF_YMM_OFFS(1),
 106     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 107     r16H_off,
 108     r17_off, r17H_off,
 109     r18_off, r18H_off,
 110     r19_off, r19H_off,
 111     r20_off, r20H_off,
 112     r21_off, r21H_off,
 113     r22_off, r22H_off,
 114     r23_off, r23H_off,
 115     r24_off, r24H_off,
 116     r25_off, r25H_off,
 117     r26_off, r26H_off,
 118     r27_off, r27H_off,
 119     r28_off, r28H_off,
 120     r29_off, r29H_off,
 121     r30_off, r30H_off,
 122     r31_off, r31H_off,
 123     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_OPMASK_OFFS(0),
 125     DEF_OPMASK_OFFS(1),
 126     // 2..7 are implied in range usage
 127     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 128     DEF_ZMM_OFFS(0),
 129     DEF_ZMM_OFFS(1),
 130     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 131     DEF_ZMM_UPPER_OFFS(16),
 132     DEF_ZMM_UPPER_OFFS(17),
 133     // 18..31 are implied in range usage
 134     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 135     fpu_stateH_end,
 136     r15_off, r15H_off,
 137     r14_off, r14H_off,
 138     r13_off, r13H_off,
 139     r12_off, r12H_off,
 140     r11_off, r11H_off,
 141     r10_off, r10H_off,
 142     r9_off,  r9H_off,
 143     r8_off,  r8H_off,
 144     rdi_off, rdiH_off,
 145     rsi_off, rsiH_off,
 146     ignore_off, ignoreH_off,  // extra copy of rbp
 147     rsp_off, rspH_off,
 148     rbx_off, rbxH_off,
 149     rdx_off, rdxH_off,
 150     rcx_off, rcxH_off,
 151     rax_off, raxH_off,
 152     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 153     align_off, alignH_off,
 154     flags_off, flagsH_off,
 155     // The frame sender code expects that rbp will be in the "natural" place and
 156     // will override any oopMap setting for it. We must therefore force the layout
 157     // so that it agrees with the frame sender code.
 158     rbp_off, rbpH_off,        // copy of rbp we will restore
 159     return_off, returnH_off,  // slot for return address
 160     reg_save_size             // size in compiler stack slots
 161   };
 162 
 163  public:
 164   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 165   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 166 
 167   // Offsets into the register save area
 168   // Used by deoptimization when it is managing result register
 169   // values on its own
 170 
 171   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 172   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 173   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 174   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 175   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 176   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 177 
  // During deoptimization only the result registers need to be restored;
  // all the other values have already been extracted.
 180   static void restore_result_registers(MacroAssembler* masm);
 181 };
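// Typical use (see gen_c2i_adapter below): save_live_registers() before a call
// into the VM, add a GC map at the call site, then restore_live_registers()
// afterwards. restore_result_registers() is the deoptimization-only variant
// that reloads just rax, rdx and xmm0.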
 182 
 183 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 184   int off = 0;
 185   int num_xmm_regs = XMMRegister::available_xmm_registers();
 186 #if COMPILER2_OR_JVMCI
 187   if (save_wide_vectors && UseAVX == 0) {
 188     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 189   }
 190   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 191 #else
 192   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 193 #endif
 194 
  // Always make the frame size 16-byte aligned; both vector and non-vector frames are allocated this way
 196   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 197   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 198   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 199   // CodeBlob frame size is in words.
 200   int frame_size_in_words = frame_size_in_bytes / wordSize;
 201   *total_frame_words = frame_size_in_words;
 202 
  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
 208 
 209   __ enter();          // rsp becomes 16-byte aligned here
 210   __ pushf();
 211   // Make sure rsp stays 16-byte aligned
 212   __ subq(rsp, 8);
 213   // Push CPU state in multiple of 16 bytes
 214   __ save_legacy_gprs();
 215   __ push_FPU_state();
 216 
 217 
 218   // push cpu state handles this on EVEX enabled targets
 219   if (save_wide_vectors) {
 220     // Save upper half of YMM registers(0..15)
 221     int base_addr = XSAVE_AREA_YMM_BEGIN;
 222     for (int n = 0; n < 16; n++) {
 223       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 224     }
 225     if (VM_Version::supports_evex()) {
 226       // Save upper half of ZMM registers(0..15)
 227       base_addr = XSAVE_AREA_ZMM_BEGIN;
 228       for (int n = 0; n < 16; n++) {
 229         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 230       }
 231       // Save full ZMM registers(16..num_xmm_regs)
 232       base_addr = XSAVE_AREA_UPPERBANK;
 233       off = 0;
 234       int vector_len = Assembler::AVX_512bit;
 235       for (int n = 16; n < num_xmm_regs; n++) {
 236         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 237       }
 238 #if COMPILER2_OR_JVMCI
 239       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 240       off = 0;
 241       for(int n = 0; n < KRegister::number_of_registers; n++) {
 242         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 243       }
 244 #endif
 245     }
 246   } else {
 247     if (VM_Version::supports_evex()) {
 248       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 249       int base_addr = XSAVE_AREA_UPPERBANK;
 250       off = 0;
 251       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 252       for (int n = 16; n < num_xmm_regs; n++) {
 253         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 254       }
 255 #if COMPILER2_OR_JVMCI
 256       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 257       off = 0;
 258       for(int n = 0; n < KRegister::number_of_registers; n++) {
 259         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 260       }
 261 #endif
 262     }
 263   }
 264 
 265 #if COMPILER2_OR_JVMCI
 266   if (UseAPX) {
 267       int base_addr = XSAVE_AREA_EGPRS;
 268       off = 0;
 269       for (int n = 16; n < Register::number_of_registers; n++) {
 270         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 271       }
 272   }
 273 #endif
 274 
 275   __ vzeroupper();
 276   if (frame::arg_reg_save_area_bytes != 0) {
 277     // Allocate argument register save area
 278     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 279   }
 280 
 281   // Set an oopmap for the call site.  This oopmap will map all
 282   // oop-registers and debug-info registers as callee-saved.  This
 283   // will allow deoptimization at this safepoint to find all possible
 284   // debug-info recordings, as well as let GC find all oops.
 285 
 286   OopMapSet *oop_maps = new OopMapSet();
 287   OopMap* map = new OopMap(frame_size_in_slots, 0);
 288 
 289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 290 
 291   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 294   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 297   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 306   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 307 
 308   if (UseAPX) {
 309     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 325   }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is also included in the xsave area.
 328   off = xmm0_off;
 329   int delta = xmm1_off - off;
 330   for (int n = 0; n < 16; n++) {
 331     XMMRegister xmm_name = as_XMMRegister(n);
 332     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 333     off += delta;
 334   }
 335   if (UseAVX > 2) {
 336     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 337     off = zmm16_off;
 338     delta = zmm17_off - off;
 339     for (int n = 16; n < num_xmm_regs; n++) {
 340       XMMRegister zmm_name = as_XMMRegister(n);
 341       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 342       off += delta;
 343     }
 344   }
 345 
 346 #if COMPILER2_OR_JVMCI
 347   if (save_wide_vectors) {
 348     // Save upper half of YMM registers(0..15)
 349     off = ymm0_off;
 350     delta = ymm1_off - ymm0_off;
 351     for (int n = 0; n < 16; n++) {
 352       XMMRegister ymm_name = as_XMMRegister(n);
 353       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 354       off += delta;
 355     }
 356     if (VM_Version::supports_evex()) {
 357       // Save upper half of ZMM registers(0..15)
 358       off = zmm0_off;
 359       delta = zmm1_off - zmm0_off;
 360       for (int n = 0; n < 16; n++) {
 361         XMMRegister zmm_name = as_XMMRegister(n);
 362         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 363         off += delta;
 364       }
 365     }
 366   }
 367 #endif // COMPILER2_OR_JVMCI
 368 
 369   // %%% These should all be a waste but we'll keep things as they were for now
 370   if (true) {
 371     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 375     // rbp location is known implicitly by the frame sender code, needs no oopmap
 376     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 385     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 386     if (UseAPX) {
 387       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 403     }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is also included in the xsave area.
 406     off = xmm0H_off;
 407     delta = xmm1H_off - off;
 408     for (int n = 0; n < 16; n++) {
 409       XMMRegister xmm_name = as_XMMRegister(n);
 410       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 411       off += delta;
 412     }
 413     if (UseAVX > 2) {
 414       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 415       off = zmm16H_off;
 416       delta = zmm17H_off - off;
 417       for (int n = 16; n < num_xmm_regs; n++) {
 418         XMMRegister zmm_name = as_XMMRegister(n);
 419         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 420         off += delta;
 421       }
 422     }
 423   }
 424 
 425   return map;
 426 }
 427 
 428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 429   int num_xmm_regs = XMMRegister::available_xmm_registers();
 430   if (frame::arg_reg_save_area_bytes != 0) {
 431     // Pop arg register save area
 432     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 433   }
 434 
 435 #if COMPILER2_OR_JVMCI
 436   if (restore_wide_vectors) {
 437     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 438     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 439   }
 440 #else
 441   assert(!restore_wide_vectors, "vectors are generated only by C2");
 442 #endif
 443 
 444   __ vzeroupper();
 445 
 446   // On EVEX enabled targets everything is handled in pop fpu state
 447   if (restore_wide_vectors) {
 448     // Restore upper half of YMM registers (0..15)
 449     int base_addr = XSAVE_AREA_YMM_BEGIN;
 450     for (int n = 0; n < 16; n++) {
 451       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 452     }
 453     if (VM_Version::supports_evex()) {
 454       // Restore upper half of ZMM registers (0..15)
 455       base_addr = XSAVE_AREA_ZMM_BEGIN;
 456       for (int n = 0; n < 16; n++) {
 457         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 458       }
 459       // Restore full ZMM registers(16..num_xmm_regs)
 460       base_addr = XSAVE_AREA_UPPERBANK;
 461       int vector_len = Assembler::AVX_512bit;
 462       int off = 0;
 463       for (int n = 16; n < num_xmm_regs; n++) {
 464         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 465       }
 466 #if COMPILER2_OR_JVMCI
 467       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 468       off = 0;
 469       for (int n = 0; n < KRegister::number_of_registers; n++) {
 470         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 471       }
 472 #endif
 473     }
 474   } else {
 475     if (VM_Version::supports_evex()) {
 476       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 477       int base_addr = XSAVE_AREA_UPPERBANK;
 478       int off = 0;
 479       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 480       for (int n = 16; n < num_xmm_regs; n++) {
 481         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 482       }
 483 #if COMPILER2_OR_JVMCI
 484       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 485       off = 0;
 486       for (int n = 0; n < KRegister::number_of_registers; n++) {
 487         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 488       }
 489 #endif
 490     }
 491   }
 492 
 493 #if COMPILER2_OR_JVMCI
 494   if (UseAPX) {
 495     int base_addr = XSAVE_AREA_EGPRS;
 496     int off = 0;
 497     for (int n = 16; n < Register::number_of_registers; n++) {
 498       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 499     }
 500   }
 501 #endif
 502 
 503   // Recover CPU state
 504   __ pop_FPU_state();
 505   __ restore_legacy_gprs();
 506   __ addq(rsp, 8);
 507   __ popf();
 508   // Get the rbp described implicitly by the calling convention (no oopMap)
 509   __ pop(rbp);
 510 }
 511 
 512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 513 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 519 
 520   // Restore fp result register
 521   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 522   // Restore integer result register
 523   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 524   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 525 
  // Pop all of the register save area off the stack except the return address
 527   __ addptr(rsp, return_offset_in_bytes());
 528 }
 529 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
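// Only 32-byte (YMM) and 64-byte (ZMM) vectors count as wide here; they are the
// ones that need the extra save/restore code in RegisterSaver above.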
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
 542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64 bit build
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
 552 // By skipping the first C ABI register we can call non-static jni methods
 553 // with small numbers of arguments without having to shuffle the arguments
 554 // at all. Since we control the java ABI we ought to at least get some
 555 // advantage out of it.
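// For instance (a sketch of the mapping below): a signature of
// (int, long, Object, double, float), i.e. sig_bt = { T_INT, T_LONG, T_VOID,
// T_OBJECT, T_DOUBLE, T_VOID, T_FLOAT }, is assigned j_rarg0, j_rarg1, (bad),
// j_rarg2, j_farg0, (bad), j_farg1 and needs no stack slots (stk_args == 0).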
 556 
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
 637 
 638 // Same as java_calling_convention() but for multiple return
 639 // values. There's no way to store them on the stack so if we don't
 640 // have enough registers, multiple values can't be returned.
 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
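// Note: the "+1" in java_return_convention_max_int is rax, which is available
// for returns in addition to the j_rarg registers (see INT_ArgReg below).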
 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 644                                           VMRegPair *regs,
 645                                           int total_args_passed) {
 646   // Create the mapping between argument positions and
 647   // registers.
 648   static const Register INT_ArgReg[java_return_convention_max_int] = {
 649     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 650   };
 651   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 652     j_farg0, j_farg1, j_farg2, j_farg3,
 653     j_farg4, j_farg5, j_farg6, j_farg7
 654   };
 655 
 656 
 657   uint int_args = 0;
 658   uint fp_args = 0;
 659 
 660   for (int i = 0; i < total_args_passed; i++) {
 661     switch (sig_bt[i]) {
 662     case T_BOOLEAN:
 663     case T_CHAR:
 664     case T_BYTE:
 665     case T_SHORT:
 666     case T_INT:
 667       if (int_args < Argument::n_int_register_parameters_j+1) {
 668         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 669         int_args++;
 670       } else {
 671         return -1;
 672       }
 673       break;
 674     case T_VOID:
 675       // halves of T_LONG or T_DOUBLE
 676       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 677       regs[i].set_bad();
 678       break;
 679     case T_LONG:
 680       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 681       // fall through
 682     case T_OBJECT:
 683     case T_ARRAY:
 684     case T_ADDRESS:
 685     case T_METADATA:
 686       if (int_args < Argument::n_int_register_parameters_j+1) {
 687         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 688         int_args++;
 689       } else {
 690         return -1;
 691       }
 692       break;
 693     case T_FLOAT:
 694       if (fp_args < Argument::n_float_register_parameters_j) {
 695         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 696         fp_args++;
 697       } else {
 698         return -1;
 699       }
 700       break;
 701     case T_DOUBLE:
 702       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 703       if (fp_args < Argument::n_float_register_parameters_j) {
 704         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 705         fp_args++;
 706       } else {
 707         return -1;
 708       }
 709       break;
 710     default:
 711       ShouldNotReachHere();
 712       break;
 713     }
 714   }
 715 
 716   return int_args + fp_args;
 717 }
 718 
// Patch the caller's callsite with the entry to compiled code if it exists.
 720 static void patch_callers_callsite(MacroAssembler *masm) {
 721   Label L;
 722   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 723   __ jcc(Assembler::equal, L);
 724 
 725   // Save the current stack pointer
 726   __ mov(r13, rsp);
 727   // Schedule the branch target address early.
 728   // Call into the VM to patch the caller, then jump to compiled callee
 729   // rax isn't live so capture return address while we easily can
 730   __ movptr(rax, Address(rsp, 0));
 731 
 732   // align stack so push_CPU_state doesn't fault
 733   __ andptr(rsp, -(StackAlignmentInBytes));
 734   __ push_CPU_state();
 735   __ vzeroupper();
 736   // VM needs caller's callsite
 737   // VM needs target method
 738   // This needs to be a long call since we will relocate this adapter to
 739   // the codeBuffer and it may not reach
 740 
 741   // Allocate argument register save area
 742   if (frame::arg_reg_save_area_bytes != 0) {
 743     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 744   }
 745   __ mov(c_rarg0, rbx);
 746   __ mov(c_rarg1, rax);
 747   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 748 
 749   // De-allocate argument register save area
 750   if (frame::arg_reg_save_area_bytes != 0) {
 751     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 752   }
 753 
 754   __ vzeroupper();
 755   __ pop_CPU_state();
 756   // restore sp
 757   __ mov(rsp, r13);
 758   __ bind(L);
 759 }
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
        // inline type and T_VOID to mark the end of the inline
        // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
 780         total_args_passed++;
 781         int vt = 1;
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
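// For the flattened example in the comment above, sig_extended holds the eight
// entries T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID T_VOID T_VOID; the
// loop counts them as a single interpreter argument: vt goes 1 -> 2 at the inner
// T_METADATA, is unchanged at the T_VOID that is merely the second T_LONG slot,
// and drops back to 1 and then 0 at the two trailing T_VOIDs.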
 803 
 804 
 805 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 806                                    BasicType bt,
 807                                    BasicType prev_bt,
 808                                    size_t size_in_bytes,
 809                                    const VMRegPair& reg_pair,
 810                                    const Address& to,
 811                                    int extraspace,
 812                                    bool is_oop) {
 813   if (bt == T_VOID) {
 814     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 815     return;
 816   }
 817 
 818   // Say 4 args:
 819   // i   st_off
 820   // 0   32 T_LONG
 821   // 1   24 T_VOID
 822   // 2   16 T_OBJECT
 823   // 3    8 T_BOOL
 824   // -    0 return address
 825   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 830 
 831   bool wide = (size_in_bytes == wordSize);
 832   VMReg r_1 = reg_pair.first();
 833   VMReg r_2 = reg_pair.second();
 834   assert(r_2->is_valid() == wide, "invalid size");
 835   if (!r_1->is_valid()) {
 836     assert(!r_2->is_valid(), "must be invalid");
 837     return;
 838   }
 839 
 840   if (!r_1->is_XMMRegister()) {
 841     Register val = rax;
 842     if (r_1->is_stack()) {
 843       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 844       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 845     } else {
 846       val = r_1->as_Register();
 847     }
 848     assert_different_registers(to.base(), val, rscratch1);
 849     if (is_oop) {
 850       __ push(r13);
 851       __ push(rbx);
 852       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 853       __ pop(rbx);
 854       __ pop(r13);
 855     } else {
 856       __ store_sized_value(to, val, size_in_bytes);
 857     }
 858   } else {
 859     if (wide) {
 860       __ movdbl(to, r_1->as_XMMRegister());
 861     } else {
 862       __ movflt(to, r_1->as_XMMRegister());
 863     }
 864   }
 865 }
 866 
 867 static void gen_c2i_adapter(MacroAssembler *masm,
 868                             const GrowableArray<SigEntry>* sig_extended,
 869                             const VMRegPair *regs,
 870                             bool requires_clinit_barrier,
 871                             address& c2i_no_clinit_check_entry,
 872                             Label& skip_fixup,
 873                             address start,
 874                             OopMapSet* oop_maps,
 875                             int& frame_complete,
 876                             int& frame_size_in_words,
 877                             bool alloc_inline_receiver) {
 878   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 879     Label L_skip_barrier;
 880     Register method = rbx;
 881 
 882     { // Bypass the barrier for non-static methods
 883       Register flags = rscratch1;
 884       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
 885       __ testl(flags, JVM_ACC_STATIC);
 886       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 887     }
 888 
 889     Register klass = rscratch1;
 890     __ load_method_holder(klass, method);
 891     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
 892 
 893     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 894 
 895     __ bind(L_skip_barrier);
 896     c2i_no_clinit_check_entry = __ pc();
 897   }
 898 
 899   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 900   bs->c2i_entry_barrier(masm);
 901 
 902   // Before we get into the guts of the C2I adapter, see if we should be here
 903   // at all.  We've come from compiled code and are attempting to jump to the
 904   // interpreter, which means the caller made a static call to get here
 905   // (vcalls always get a compiled target if there is one).  Check for a
 906   // compiled target.  If there is one, we need to patch the caller's call.
 907   patch_callers_callsite(masm);
 908 
 909   __ bind(skip_fixup);
 910 
 911   if (InlineTypePassFieldsAsArgs) {
 912     // Is there an inline type argument?
 913     bool has_inline_argument = false;
 914     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 915       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 916     }
 917     if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
      // compiled code so we have no buffers to back the inline types.
      // Allocate the buffers here with a runtime call.
 921       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 922 
 923       frame_complete = __ offset();
 924 
 925       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 926 
 927       __ mov(c_rarg0, r15_thread);
 928       __ mov(c_rarg1, rbx);
 929       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 930       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 931 
 932       oop_maps->add_gc_map((int)(__ pc() - start), map);
 933       __ reset_last_Java_frame(false);
 934 
 935       RegisterSaver::restore_live_registers(masm);
 936 
 937       Label no_exception;
 938       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 939       __ jcc(Assembler::equal, no_exception);
 940 
 941       __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
 942       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 943       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 944 
 945       __ bind(no_exception);
 946 
 947       // We get an array of objects from the runtime call
 948       __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 949       __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
 950     }
 951   }
 952 
 953   // Since all args are passed on the stack, total_args_passed *
 954   // Interpreter::stackElementSize is the space we need.
 955   int total_args_passed = compute_total_args_passed_int(sig_extended);
 956   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 957 
 958   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 959 
 960   // stack is aligned, keep it that way
 961   // This is not currently needed or enforced by the interpreter, but
 962   // we might as well conform to the ABI.
 963   extraspace = align_up(extraspace, 2*wordSize);
 964 
 965   // set senderSP value
 966   __ lea(r13, Address(rsp, wordSize));
 967 
 968 #ifdef ASSERT
 969   __ check_stack_alignment(r13, "sender stack not aligned");
 970 #endif
 971   if (extraspace > 0) {
 972     // Pop the return address
 973     __ pop(rax);
 974 
 975     __ subptr(rsp, extraspace);
 976 
 977     // Push the return address
 978     __ push(rax);
 979 
    // Account for the return address location since we store it first rather
    // than holding it in a register across all the shuffling
 982     extraspace += wordSize;
 983   }
 984 
 985 #ifdef ASSERT
 986   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 987 #endif
 988 
 989   // Now write the args into the outgoing interpreter space
 990 
 991   // next_arg_comp is the next argument from the compiler point of
 992   // view (inline type fields are passed in registers/on the stack). In
 993   // sig_extended, an inline type argument starts with: T_METADATA,
 994   // followed by the types of the fields of the inline type and T_VOID
 995   // to mark the end of the inline type. ignored counts the number of
 996   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 997   // used to get the buffer for that argument from the pool of buffers
 998   // we allocated above and want to pass to the
 999   // interpreter. next_arg_int is the next argument from the
1000   // interpreter point of view (inline types are passed by reference).
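  // For instance (a sketch, with a hypothetical inline type MyValue holding a
  // single int field): for a compiled signature of (int, MyValue), sig_extended
  // would contain something like { T_INT, T_METADATA, T_INT, T_VOID }. The loop
  // below stores the plain int straight into its interpreter slot; for the
  // T_METADATA entry it loads the pre-allocated buffer into r14, copies the
  // compiled int field into the buffer at its _offset, and finally stores the
  // buffer oop into the argument's interpreter slot.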
1001   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1002        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1003     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1004     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1005     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1006     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1007     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1008       int next_off = st_off - Interpreter::stackElementSize;
1009       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1010       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1011       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1012       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1013                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1014       next_arg_int++;
1015 #ifdef ASSERT
1016       if (bt == T_LONG || bt == T_DOUBLE) {
1017         // Overwrite the unused slot with known junk
1018         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1019         __ movptr(Address(rsp, st_off), rax);
1020       }
1021 #endif /* ASSERT */
1022     } else {
1023       ignored++;
1024       // get the buffer from the just allocated pool of buffers
1025       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1026       __ load_heap_oop(r14, Address(rscratch2, index));
1027       next_vt_arg++; next_arg_int++;
1028       int vt = 1;
1029       // write fields we get from compiled code in registers/stack
1030       // slots to the buffer: we know we are done with that inline type
1031       // argument when we hit the T_VOID that acts as an end of inline
1032       // type delimiter for this inline type. Inline types are flattened
1033       // so we might encounter embedded inline types. Each entry in
1034       // sig_extended contains a field offset in the buffer.
1035       Label L_null;
1036       do {
1037         next_arg_comp++;
1038         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1039         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1040         if (bt == T_METADATA) {
1041           vt++;
1042           ignored++;
1043         } else if (bt == T_VOID &&
1044                    prev_bt != T_LONG &&
1045                    prev_bt != T_DOUBLE) {
1046           vt--;
1047           ignored++;
1048         } else {
1049           int off = sig_extended->at(next_arg_comp)._offset;
1050           if (off == -1) {
1051             // Nullable inline type argument, emit null check
1052             VMReg reg = regs[next_arg_comp-ignored].first();
1053             Label L_notNull;
1054             if (reg->is_stack()) {
1055               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1056               __ testb(Address(rsp, ld_off), 1);
1057             } else {
1058               __ testb(reg->as_Register(), 1);
1059             }
1060             __ jcc(Assembler::notZero, L_notNull);
1061             __ movptr(Address(rsp, st_off), 0);
1062             __ jmp(L_null);
1063             __ bind(L_notNull);
1064             continue;
1065           }
1066           assert(off > 0, "offset in object should be positive");
1067           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1068           bool is_oop = is_reference_type(bt);
1069           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1070                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1071         }
1072       } while (vt != 0);
1073       // pass the buffer to the interpreter
1074       __ movptr(Address(rsp, st_off), r14);
1075       __ bind(L_null);
1076     }
1077   }
1078 
1079   // Schedule the branch target address early.
1080   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1081   __ jmp(rcx);
1082 }
1083 
1084 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1085                                     int comp_args_on_stack,
1086                                     const GrowableArray<SigEntry>* sig,
1087                                     const VMRegPair *regs) {
1088 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
1097 
1098   // Adapters can be frameless because they do not require the caller
1099   // to perform additional cleanup work, such as correcting the stack pointer.
1100   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1101   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1102   // even if a callee has modified the stack pointer.
1103   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1104   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1105   // up via the senderSP register).
1106   // In other words, if *either* the caller or callee is interpreted, we can
1107   // get the stack pointer repaired after a call.
1108   // This is why c2i and i2c adapters cannot be indefinitely composed.
1109   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1110   // both caller and callee would be compiled methods, and neither would
1111   // clean up the stack pointer changes performed by the two adapters.
1112   // If this happens, control eventually transfers back to the compiled
1113   // caller, but with an uncorrected stack, causing delayed havoc.
1114 
1115   // Must preserve original SP for loading incoming arguments because
1116   // we need to align the outgoing SP for compiled code.
1117   __ movptr(r11, rsp);
1118 
1119   // Pick up the return address
1120   __ pop(rax);
1121 
1122   // Convert 4-byte c2 stack slots to words.
1123   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1124 
1125   if (comp_args_on_stack) {
1126     __ subptr(rsp, comp_words_on_stack * wordSize);
1127   }
1128 
1129   // Ensure compiled code always sees stack at proper alignment
1130   __ andptr(rsp, -16);
1131 
  // Push the return address and misalign the stack so that the youngest frame
  // always sees the stack as it appears just after a call instruction.
1134   __ push(rax);
1135 
1136   // Put saved SP in another register
1137   const Register saved_sp = rax;
1138   __ movptr(saved_sp, r11);
1139 
1140   // Will jump to the compiled code just as if compiled code was doing it.
1141   // Pre-load the register-jump target early, to schedule it better.
1142   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1143 
1144 #if INCLUDE_JVMCI
1145   if (EnableJVMCI) {
1146     // check if this call should be routed towards a specific entry point
1147     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1148     Label no_alternative_target;
1149     __ jcc(Assembler::equal, no_alternative_target);
1150     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1151     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1152     __ bind(no_alternative_target);
1153   }
1154 #endif // INCLUDE_JVMCI
1155 
1156   int total_args_passed = sig->length();
1157 
1158   // Now generate the shuffle code.  Pick up all register args and move the
1159   // rest through the floating point stack top.
1160   for (int i = 0; i < total_args_passed; i++) {
1161     BasicType bt = sig->at(i)._bt;
1162     if (bt == T_VOID) {
1163       // Longs and doubles are passed in native word order, but misaligned
1164       // in the 32-bit build.
1165       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1166       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1167       continue;
1168     }
1169 
1170     // Pick up 0, 1 or 2 words from SP+offset.
1171 
1172     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1173             "scrambled load targets?");
1174     // Load in argument order going down.
1175     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1176     // Point to interpreter value (vs. tag)
1177     int next_off = ld_off - Interpreter::stackElementSize;
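    // For instance, with 4 interpreter arguments the first argument (i == 0) is
    // loaded from saved_sp + 4 * Interpreter::stackElementSize, while a long or
    // double occupying argument slots 0/1 is read from the lower slot via next_off.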
1181     VMReg r_1 = regs[i].first();
1182     VMReg r_2 = regs[i].second();
1183     if (!r_1->is_valid()) {
1184       assert(!r_2->is_valid(), "");
1185       continue;
1186     }
1187     if (r_1->is_stack()) {
1188       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1189       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1190 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going thru a c2i because of a miss, a reasonable value of r13
      // will be generated.
1194       if (!r_2->is_valid()) {
1195         // sign extend???
1196         __ movl(r13, Address(saved_sp, ld_off));
1197         __ movptr(Address(rsp, st_off), r13);
1198       } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed with negative offsets so the LSW is at the LOW address
1206 
1207         // ld_off is MSW so get LSW
1208         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1209                            next_off : ld_off;
1210         __ movq(r13, Address(saved_sp, offset));
1211         // st_off is LSW (i.e. reg.first())
1212         __ movq(Address(rsp, st_off), r13);
1213       }
1214     } else if (r_1->is_Register()) {  // Register argument
1215       Register r = r_1->as_Register();
1216       assert(r != rax, "must be different");
1217       if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1222 
1223         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1224                            next_off : ld_off;
1225 
1226         // this can be a misaligned move
1227         __ movq(r, Address(saved_sp, offset));
1228       } else {
1229         // sign extend and use a full word?
1230         __ movl(r, Address(saved_sp, ld_off));
1231       }
1232     } else {
1233       if (!r_2->is_valid()) {
1234         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1235       } else {
1236         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1237       }
1238     }
1239   }
1240 
1241   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1242 
  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.
1252 
1253   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1254 
  // Put the Method* where a c2i would expect it should we end up there.
  // This is only needed because c2 resolve stubs return the Method* as a result in
  // rax.
1258   __ mov(rax, rbx);
1259   __ jmp(r11);
1260 }
1261 
1262 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1263   Register data = rax;
1264   __ ic_check(1 /* end_alignment */);
1265   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1266 
  // Method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get
  // the call site corrected.
1270   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1271   __ jcc(Assembler::equal, skip_fixup);
1272   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1273 }
1274 
1275 // ---------------------------------------------------------------
1276 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1277                                             int comp_args_on_stack,
1278                                             const GrowableArray<SigEntry>* sig,
1279                                             const VMRegPair* regs,
1280                                             const GrowableArray<SigEntry>* sig_cc,
1281                                             const VMRegPair* regs_cc,
1282                                             const GrowableArray<SigEntry>* sig_cc_ro,
1283                                             const VMRegPair* regs_cc_ro,
1284                                             AdapterHandlerEntry* handler,
1285                                             AdapterBlob*& new_adapter,
1286                                             bool allocate_code_blob) {
1287   address i2c_entry = __ pc();
1288   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1289 
1290   // -------------------------------------------------------------------------
1291   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1292   // to the interpreter.  The args start out packed in the compiled layout.  They
1293   // need to be unpacked into the interpreter layout.  This will almost always
1294   // require some stack space.  We grow the current (compiled) stack, then repack
1295   // the args.  We finally end in a jump to the generic interpreter entry point.
1296   // On exit from the interpreter, the interpreter will restore our SP (lest the
1297   // compiled code, which relies solely on SP and not RBP, get sick).
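       // For example, an argument that compiled code passes in a register (e.g. j_rarg0) has
       // to be spilled into the interpreter's outgoing argument area on the stack before we
       // jump to the interpreted entry point -- the reverse of the i2c repacking above.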
1298 
1299   address c2i_unverified_entry        = __ pc();
1300   address c2i_unverified_inline_entry = __ pc();
1301   Label skip_fixup;
1302 
1303   gen_inline_cache_check(masm, skip_fixup);
1304 
1305   OopMapSet* oop_maps = new OopMapSet();
1306   int frame_complete = CodeOffsets::frame_never_safe;
1307   int frame_size_in_words = 0;
1308 
1309   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1310   address c2i_no_clinit_check_entry = nullptr;
1311   address c2i_inline_ro_entry = __ pc();
1312   if (regs_cc != regs_cc_ro) {
1313     // No class init barrier needed because method is guaranteed to be non-static
1314     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1315                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1316     skip_fixup.reset();
1317   }
1318 
1319   // Scalarized c2i adapter
1320   address c2i_entry        = __ pc();
1321   address c2i_inline_entry = __ pc();
1322   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1323                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1324 
1325   // Non-scalarized c2i adapter
1326   if (regs != regs_cc) {
1327     c2i_unverified_inline_entry = __ pc();
1328     Label inline_entry_skip_fixup;
1329     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1330 
1331     c2i_inline_entry = __ pc();
1332     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1333                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1334   }
1335 
1336   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1337   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1338   if (allocate_code_blob) {
1339     bool caller_must_gc_arguments = (regs != regs_cc);
1340     int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
1341     assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
1342     entry_offset[0] = 0; // i2c_entry offset
1343     entry_offset[1] = c2i_entry - i2c_entry;
1344     entry_offset[2] = c2i_inline_entry - i2c_entry;
1345     entry_offset[3] = c2i_inline_ro_entry - i2c_entry;
1346     entry_offset[4] = c2i_unverified_entry - i2c_entry;
1347     entry_offset[5] = c2i_unverified_inline_entry - i2c_entry;
1348     entry_offset[6] = c2i_no_clinit_check_entry - i2c_entry;
1349 
1350     new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1351   }
1352 
1353   handler->set_entry_points(i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry,
1354                             c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1355 }
1356 
1357 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1358                                          VMRegPair *regs,
1359                                          int total_args_passed) {
1360 
1361 // We return the number of VMRegImpl stack slots we need to reserve for all
1362 // the arguments NOT counting out_preserve_stack_slots.
1363 
1364 // NOTE: These arrays will have to change when c1 is ported
1365 #ifdef _WIN64
1366     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1367       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1368     };
1369     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1370       c_farg0, c_farg1, c_farg2, c_farg3
1371     };
1372 #else
1373     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1374       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1375     };
1376     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1377       c_farg0, c_farg1, c_farg2, c_farg3,
1378       c_farg4, c_farg5, c_farg6, c_farg7
1379     };
1380 #endif // _WIN64
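         // Illustrative mapping (not generated here): for sig_bt = { T_ADDRESS, T_OBJECT,
         // T_LONG, T_VOID, T_DOUBLE, T_VOID } -- i.e. (JNIEnv*, jclass, jlong, jdouble) --
         // the loop below assigns:
         //   SysV:  rdi, rsi, rdx, xmm0   (integer and FP registers are counted separately)
         //   Win64: rcx, rdx, r8,  xmm3   (both counters advance for every argument, so the
         //                                 double lands in the fourth FP register)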
1381 
1382 
1383     uint int_args = 0;
1384     uint fp_args = 0;
1385     uint stk_args = 0; // inc by 2 each time
1386 
1387     for (int i = 0; i < total_args_passed; i++) {
1388       switch (sig_bt[i]) {
1389       case T_BOOLEAN:
1390       case T_CHAR:
1391       case T_BYTE:
1392       case T_SHORT:
1393       case T_INT:
1394         if (int_args < Argument::n_int_register_parameters_c) {
1395           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1396 #ifdef _WIN64
1397           fp_args++;
1398           // Allocate slots for the callee to stuff register args on the stack.
1399           stk_args += 2;
1400 #endif
1401         } else {
1402           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1403           stk_args += 2;
1404         }
1405         break;
1406       case T_LONG:
1407         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1408         // fall through
1409       case T_OBJECT:
1410       case T_ARRAY:
1411       case T_ADDRESS:
1412       case T_METADATA:
1413         if (int_args < Argument::n_int_register_parameters_c) {
1414           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1415 #ifdef _WIN64
1416           fp_args++;
1417           stk_args += 2;
1418 #endif
1419         } else {
1420           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1421           stk_args += 2;
1422         }
1423         break;
1424       case T_FLOAT:
1425         if (fp_args < Argument::n_float_register_parameters_c) {
1426           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1427 #ifdef _WIN64
1428           int_args++;
1429           // Allocate slots for the callee to stuff register args on the stack.
1430           stk_args += 2;
1431 #endif
1432         } else {
1433           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1434           stk_args += 2;
1435         }
1436         break;
1437       case T_DOUBLE:
1438         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1439         if (fp_args < Argument::n_float_register_parameters_c) {
1440           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1441 #ifdef _WIN64
1442           int_args++;
1443           // Allocate slots for the callee to stuff register args on the stack.
1444           stk_args += 2;
1445 #endif
1446         } else {
1447           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1448           stk_args += 2;
1449         }
1450         break;
1451       case T_VOID: // Halves of longs and doubles
1452         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1453         regs[i].set_bad();
1454         break;
1455       default:
1456         ShouldNotReachHere();
1457         break;
1458       }
1459     }
1460 #ifdef _WIN64
1461   // The Windows ABI requires that we always allocate enough stack space
1462   // for the 4 register arguments to be stored down (the 32-byte shadow space).
1463   if (stk_args < 8) {
1464     stk_args = 8;
1465   }
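       // Illustrative consequence: even when every argument fits in registers, stk_args is
       // reported as 8 here, i.e. 4 shadow registers * 2 VMRegImpl slots each (32 bytes).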
1466 #endif // _WIN64
1467 
1468   return stk_args;
1469 }
1470 
1471 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1472                                              uint num_bits,
1473                                              uint total_args_passed) {
1474   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1475          "only certain vector sizes are supported for now");
1476 
1477   static const XMMRegister VEC_ArgReg[32] = {
1478      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1479      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1480     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1481     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1482   };
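       // Every vector argument is passed in the next XMM/YMM/ZMM register; none are placed on
       // the stack. Illustrative example: with num_bits == 256, each regs[i] is a pair covering
       // xmm_i through xmm_i->next(7), i.e. the 8 32-bit slots of one 256-bit register.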
1483 
1484   uint stk_args = 0;
1485   uint fp_args = 0;
1486 
1487   for (uint i = 0; i < total_args_passed; i++) {
1488     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1489     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1490     regs[i].set_pair(vmreg->next(next_val), vmreg);
1491   }
1492 
1493   return stk_args;
1494 }
1495 
1496 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1497   // We always ignore the frame_slots arg and just use the space just below the
1498   // frame pointer, which by this time is free to use.
1499   switch (ret_type) {
1500   case T_FLOAT:
1501     __ movflt(Address(rbp, -wordSize), xmm0);
1502     break;
1503   case T_DOUBLE:
1504     __ movdbl(Address(rbp, -wordSize), xmm0);
1505     break;
1506   case T_VOID:  break;
1507   default: {
1508     __ movptr(Address(rbp, -wordSize), rax);
1509     }
1510   }
1511 }
1512 
1513 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1514   // We always ignore the frame_slots arg and just use the space just below the
1515   // frame pointer, which by this time is free to use.
1516   switch (ret_type) {
1517   case T_FLOAT:
1518     __ movflt(xmm0, Address(rbp, -wordSize));
1519     break;
1520   case T_DOUBLE:
1521     __ movdbl(xmm0, Address(rbp, -wordSize));
1522     break;
1523   case T_VOID:  break;
1524   default: {
1525     __ movptr(rax, Address(rbp, -wordSize));
1526     }
1527   }
1528 }
1529 
1530 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1531     for ( int i = first_arg ; i < arg_count ; i++ ) {
1532       if (args[i].first()->is_Register()) {
1533         __ push(args[i].first()->as_Register());
1534       } else if (args[i].first()->is_XMMRegister()) {
1535         __ subptr(rsp, 2*wordSize);
1536         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1537       }
1538     }
1539 }
1540 
1541 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1542     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1543       if (args[i].first()->is_Register()) {
1544         __ pop(args[i].first()->as_Register());
1545       } else if (args[i].first()->is_XMMRegister()) {
1546         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1547         __ addptr(rsp, 2*wordSize);
1548       }
1549     }
1550 }
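     // Note: save_args/restore_args are used as a matched pair around runtime calls made while
     // the outgoing native arguments are still live (e.g. the DTrace probes and the locking
     // slow path in the native wrapper below). Each XMM argument is given a full 16 bytes of
     // stack even though the movdbl only uses 8.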
1551 
1552 static void verify_oop_args(MacroAssembler* masm,
1553                             const methodHandle& method,
1554                             const BasicType* sig_bt,
1555                             const VMRegPair* regs) {
1556   Register temp_reg = rbx;  // not part of any compiled calling seq
1557   if (VerifyOops) {
1558     for (int i = 0; i < method->size_of_parameters(); i++) {
1559       if (is_reference_type(sig_bt[i])) {
1560         VMReg r = regs[i].first();
1561         assert(r->is_valid(), "bad oop arg");
1562         if (r->is_stack()) {
1563           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1564           __ verify_oop(temp_reg);
1565         } else {
1566           __ verify_oop(r->as_Register());
1567         }
1568       }
1569     }
1570   }
1571 }
1572 
1573 static void check_continuation_enter_argument(VMReg actual_vmreg,
1574                                               Register expected_reg,
1575                                               const char* name) {
1576   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1577   assert(actual_vmreg->as_Register() == expected_reg,
1578          "%s is in unexpected register: %s instead of %s",
1579          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1580 }
1581 
1582 
1583 //---------------------------- continuation_enter_setup ---------------------------
1584 //
1585 // Arguments:
1586 //   None.
1587 //
1588 // Results:
1589 //   rsp: pointer to blank ContinuationEntry
1590 //
1591 // Kills:
1592 //   rax
1593 //
1594 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1595   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1596   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1597   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1598 
1599   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1600   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1601 
1602   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1603   OopMap* map = new OopMap(frame_size, 0);
1604 
1605   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1606   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1607   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1608 
1609   return map;
1610 }
1611 
1612 //---------------------------- fill_continuation_entry ---------------------------
1613 //
1614 // Arguments:
1615 //   rsp: pointer to blank ContinuationEntry
1616 //   reg_cont_obj: pointer to the continuation
1617 //   reg_flags: flags
1618 //
1619 // Results:
1620 //   rsp: pointer to filled out ContinuationEntry
1621 //
1622 // Kills:
1623 //   rax
1624 //
1625 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1626   assert_different_registers(rax, reg_cont_obj, reg_flags);
1627 #ifdef ASSERT
1628   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1629 #endif
1630   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1631   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1632   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1633   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1634   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1635 
1636   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1637   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1638   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1639   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1640 
1641   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1642   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1643 }
1644 
1645 //---------------------------- continuation_enter_cleanup ---------------------------
1646 //
1647 // Arguments:
1648 //   rsp: pointer to the ContinuationEntry
1649 //
1650 // Results:
1651 //   rsp: pointer to the spilled rbp in the entry frame
1652 //
1653 // Kills:
1654 //   rbx
1655 //
1656 static void continuation_enter_cleanup(MacroAssembler* masm) {
1657 #ifdef ASSERT
1658   Label L_good_sp;
1659   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1660   __ jcc(Assembler::equal, L_good_sp);
1661   __ stop("Incorrect rsp at continuation_enter_cleanup");
1662   __ bind(L_good_sp);
1663 #endif
1664   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1665   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1666 
1667   if (CheckJNICalls) {
1668     // Check if this is a virtual thread continuation
1669     Label L_skip_vthread_code;
1670     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1671     __ jcc(Assembler::equal, L_skip_vthread_code);
1672 
1673     // If the held monitor count is > 0 and this vthread is terminating then
1674     // it failed to release a JNI monitor. So we issue the same log message
1675     // that JavaThread::exit does.
1676     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1677     __ jcc(Assembler::equal, L_skip_vthread_code);
1678 
1679     // rax may hold an exception oop, save it before the call
1680     __ push(rax);
1681     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1682     __ pop(rax);
1683 
1684     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1685     // on termination. The held count is implicitly zeroed below when we restore from
1686     // the parent held count (which has to be zero).
1687     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1688 
1689     __ bind(L_skip_vthread_code);
1690   }
1691 #ifdef ASSERT
1692   else {
1693     // Check if this is a virtual thread continuation
1694     Label L_skip_vthread_code;
1695     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1696     __ jcc(Assembler::equal, L_skip_vthread_code);
1697 
1698     // See comment just above. If not checking JNI calls the JNI count is only
1699     // needed for assertion checking.
1700     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1701 
1702     __ bind(L_skip_vthread_code);
1703   }
1704 #endif
1705 
1706   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1707   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1708 
1709   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1710   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1711   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1712 }
1713 
1714 static void gen_continuation_enter(MacroAssembler* masm,
1715                                    const VMRegPair* regs,
1716                                    int& exception_offset,
1717                                    OopMapSet* oop_maps,
1718                                    int& frame_complete,
1719                                    int& stack_slots,
1720                                    int& interpreted_entry_offset,
1721                                    int& compiled_entry_offset) {
1722 
1723   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1724   int pos_cont_obj   = 0;
1725   int pos_is_cont    = 1;
1726   int pos_is_virtual = 2;
1727 
1728   // The platform-specific calling convention may present the arguments in various registers.
1729   // To simplify the rest of the code, we expect the arguments to reside in these known
1730   // registers, and we additionally check the placement here in case the calling convention
1731   // ever changes.
1732   Register reg_cont_obj   = c_rarg1;
1733   Register reg_is_cont    = c_rarg2;
1734   Register reg_is_virtual = c_rarg3;
1735 
1736   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1737   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1738   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1739 
1740   // Utility methods kill rax, make sure there are no collisions
1741   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1742 
1743   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1744                          relocInfo::static_call_type);
1745 
1746   address start = __ pc();
1747 
1748   Label L_thaw, L_exit;
1749 
1750   // i2i entry used at interp_only_mode only
1751   interpreted_entry_offset = __ pc() - start;
1752   {
1753 #ifdef ASSERT
1754     Label is_interp_only;
1755     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1756     __ jcc(Assembler::notEqual, is_interp_only);
1757     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1758     __ bind(is_interp_only);
1759 #endif
1760 
1761     __ pop(rax); // return address
1762     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1763     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1764     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1765     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1766     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1767     __ push(rax); // return address
1768     __ push_cont_fastpath();
1769 
1770     __ enter();
1771 
1772     stack_slots = 2; // will be adjusted in setup
1773     OopMap* map = continuation_enter_setup(masm, stack_slots);
1774     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1775     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1776 
1777     __ verify_oop(reg_cont_obj);
1778 
1779     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1780 
1781     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1782     __ testptr(reg_is_cont, reg_is_cont);
1783     __ jcc(Assembler::notZero, L_thaw);
1784 
1785     // --- Resolve path
1786 
1787     // Make sure the call is patchable
1788     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1789     // Emit stub for static call
1790     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1791     if (stub == nullptr) {
1792       fatal("CodeCache is full at gen_continuation_enter");
1793     }
1794     __ call(resolve);
1795     oop_maps->add_gc_map(__ pc() - start, map);
1796     __ post_call_nop();
1797 
1798     __ jmp(L_exit);
1799   }
1800 
1801   // compiled entry
1802   __ align(CodeEntryAlignment);
1803   compiled_entry_offset = __ pc() - start;
1804   __ enter();
1805 
1806   stack_slots = 2; // will be adjusted in setup
1807   OopMap* map = continuation_enter_setup(masm, stack_slots);
1808 
1809   // Frame is now completed as far as size and linkage.
1810   frame_complete = __ pc() - start;
1811 
1812   __ verify_oop(reg_cont_obj);
1813 
1814   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1815 
1816   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1817   __ testptr(reg_is_cont, reg_is_cont);
1818   __ jccb(Assembler::notZero, L_thaw);
1819 
1820   // --- call Continuation.enter(Continuation c, boolean isContinue)
1821 
1822   // Make sure the call is patchable
1823   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1824 
1825   // Emit stub for static call
1826   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1827   if (stub == nullptr) {
1828     fatal("CodeCache is full at gen_continuation_enter");
1829   }
1830 
1831   // The call needs to be resolved. There's a special case for this in
1832   // SharedRuntime::find_callee_info_helper() which calls
1833   // LinkResolver::resolve_continuation_enter() which resolves the call to
1834   // Continuation.enter(Continuation c, boolean isContinue).
1835   __ call(resolve);
1836 
1837   oop_maps->add_gc_map(__ pc() - start, map);
1838   __ post_call_nop();
1839 
1840   __ jmpb(L_exit);
1841 
1842   // --- Thawing path
1843 
1844   __ bind(L_thaw);
1845 
1846   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1847   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1848 
1849   ContinuationEntry::_return_pc_offset = __ pc() - start;
1850   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1851   __ post_call_nop();
1852 
1853   // --- Normal exit (resolve/thawing)
1854 
1855   __ bind(L_exit);
1856   ContinuationEntry::_cleanup_offset = __ pc() - start;
1857   continuation_enter_cleanup(masm);
1858   __ pop(rbp);
1859   __ ret(0);
1860 
1861   // --- Exception handling path
1862 
1863   exception_offset = __ pc() - start;
1864 
1865   continuation_enter_cleanup(masm);
1866   __ pop(rbp);
1867 
1868   __ movptr(c_rarg0, r15_thread);
1869   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1870 
1871   // rax still holds the original exception oop, save it before the call
1872   __ push(rax);
1873 
1874   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1875   __ movptr(rbx, rax);
1876 
1877   // Continue at exception handler:
1878   //   rax: exception oop
1879   //   rbx: exception handler
1880   //   rdx: exception pc
1881   __ pop(rax);
1882   __ verify_oop(rax);
1883   __ pop(rdx);
1884   __ jmp(rbx);
1885 }
1886 
1887 static void gen_continuation_yield(MacroAssembler* masm,
1888                                    const VMRegPair* regs,
1889                                    OopMapSet* oop_maps,
1890                                    int& frame_complete,
1891                                    int& stack_slots,
1892                                    int& compiled_entry_offset) {
1893   enum layout {
1894     rbp_off,
1895     rbpH_off,
1896     return_off,
1897     return_off2,
1898     framesize // inclusive of return address
1899   };
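       // Layout arithmetic: framesize is counted in 32-bit slots, so saved rbp (2 slots) plus
       // the return address (2 slots) give framesize == 4, i.e. the 2 words asserted below.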
1900   stack_slots = framesize /  VMRegImpl::slots_per_word;
1901   assert(stack_slots == 2, "recheck layout");
1902 
1903   address start = __ pc();
1904   compiled_entry_offset = __ pc() - start;
1905   __ enter();
1906   address the_pc = __ pc();
1907 
1908   frame_complete = the_pc - start;
1909 
1910   // This nop must be exactly at the PC we push into the frame info.
1911   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1912   // with it right away.
1913   __ post_call_nop();
1914   OopMap* map = new OopMap(framesize, 1);
1915   oop_maps->add_gc_map(frame_complete, map);
1916 
1917   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1918   __ movptr(c_rarg0, r15_thread);
1919   __ movptr(c_rarg1, rsp);
1920   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1921   __ reset_last_Java_frame(true);
1922 
1923   Label L_pinned;
1924 
1925   __ testptr(rax, rax);
1926   __ jcc(Assembler::notZero, L_pinned);
1927 
1928   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1929   continuation_enter_cleanup(masm);
1930   __ pop(rbp);
1931   __ ret(0);
1932 
1933   __ bind(L_pinned);
1934 
1935   // Pinned, return to caller
1936 
1937   // handle pending exception thrown by freeze
1938   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1939   Label ok;
1940   __ jcc(Assembler::equal, ok);
1941   __ leave();
1942   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1943   __ bind(ok);
1944 
1945   __ leave();
1946   __ ret(0);
1947 }
1948 
1949 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1950   ::continuation_enter_cleanup(masm);
1951 }
1952 
1953 static void gen_special_dispatch(MacroAssembler* masm,
1954                                  const methodHandle& method,
1955                                  const BasicType* sig_bt,
1956                                  const VMRegPair* regs) {
1957   verify_oop_args(masm, method, sig_bt, regs);
1958   vmIntrinsics::ID iid = method->intrinsic_id();
1959 
1960   // Now write the args into the outgoing interpreter space
1961   bool     has_receiver   = false;
1962   Register receiver_reg   = noreg;
1963   int      member_arg_pos = -1;
1964   Register member_reg     = noreg;
1965   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1966   if (ref_kind != 0) {
1967     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1968     member_reg = rbx;  // known to be free at this point
1969     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1970   } else if (iid == vmIntrinsics::_invokeBasic) {
1971     has_receiver = true;
1972   } else if (iid == vmIntrinsics::_linkToNative) {
1973     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1974     member_reg = rbx;  // known to be free at this point
1975   } else {
1976     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1977   }
1978 
1979   if (member_reg != noreg) {
1980     // Load the member_arg into register, if necessary.
1981     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1982     VMReg r = regs[member_arg_pos].first();
1983     if (r->is_stack()) {
1984       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1985     } else {
1986       // no data motion is needed
1987       member_reg = r->as_Register();
1988     }
1989   }
1990 
1991   if (has_receiver) {
1992     // Make sure the receiver is loaded into a register.
1993     assert(method->size_of_parameters() > 0, "oob");
1994     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1995     VMReg r = regs[0].first();
1996     assert(r->is_valid(), "bad receiver arg");
1997     if (r->is_stack()) {
1998       // Porting note:  This assumes that compiled calling conventions always
1999       // pass the receiver oop in a register.  If this is not true on some
2000       // platform, pick a temp and load the receiver from stack.
2001       fatal("receiver always in a register");
2002       receiver_reg = j_rarg0;  // known to be free at this point
2003       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2004     } else {
2005       // no data motion is needed
2006       receiver_reg = r->as_Register();
2007     }
2008   }
2009 
2010   // Figure out which address we are really jumping to:
2011   MethodHandles::generate_method_handle_dispatch(masm, iid,
2012                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2013 }
2014 
2015 // ---------------------------------------------------------------------------
2016 // Generate a native wrapper for a given method.  The method takes arguments
2017 // in the Java compiled code convention, marshals them to the native
2018 // convention (handlizes oops, etc), transitions to native, makes the call,
2019 // returns to java state (possibly blocking), unhandlizes any result and
2020 // returns.
2021 //
2022 // Critical native functions are a shorthand for the use of
2023 // GetPrimitiveArrayCritical and disallow the use of any other JNI
2024 // functions.  The wrapper is expected to unpack the arguments before
2025 // passing them to the callee. Critical native functions leave the state _in_Java,
2026 // since they cannot stop for GC.
2027 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
2028 // block and the check for pending exceptions, since it's impossible for them
2029 // to be thrown.
2030 //
2031 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2032                                                 const methodHandle& method,
2033                                                 int compile_id,
2034                                                 BasicType* in_sig_bt,
2035                                                 VMRegPair* in_regs,
2036                                                 BasicType ret_type) {
2037   if (method->is_continuation_native_intrinsic()) {
2038     int exception_offset = -1;
2039     OopMapSet* oop_maps = new OopMapSet();
2040     int frame_complete = -1;
2041     int stack_slots = -1;
2042     int interpreted_entry_offset = -1;
2043     int vep_offset = -1;
2044     if (method->is_continuation_enter_intrinsic()) {
2045       gen_continuation_enter(masm,
2046                              in_regs,
2047                              exception_offset,
2048                              oop_maps,
2049                              frame_complete,
2050                              stack_slots,
2051                              interpreted_entry_offset,
2052                              vep_offset);
2053     } else if (method->is_continuation_yield_intrinsic()) {
2054       gen_continuation_yield(masm,
2055                              in_regs,
2056                              oop_maps,
2057                              frame_complete,
2058                              stack_slots,
2059                              vep_offset);
2060     } else {
2061       guarantee(false, "Unknown Continuation native intrinsic");
2062     }
2063 
2064 #ifdef ASSERT
2065     if (method->is_continuation_enter_intrinsic()) {
2066       assert(interpreted_entry_offset != -1, "Must be set");
2067       assert(exception_offset != -1,         "Must be set");
2068     } else {
2069       assert(interpreted_entry_offset == -1, "Must be unset");
2070       assert(exception_offset == -1,         "Must be unset");
2071     }
2072     assert(frame_complete != -1,    "Must be set");
2073     assert(stack_slots != -1,       "Must be set");
2074     assert(vep_offset != -1,        "Must be set");
2075 #endif
2076 
2077     __ flush();
2078     nmethod* nm = nmethod::new_native_nmethod(method,
2079                                               compile_id,
2080                                               masm->code(),
2081                                               vep_offset,
2082                                               frame_complete,
2083                                               stack_slots,
2084                                               in_ByteSize(-1),
2085                                               in_ByteSize(-1),
2086                                               oop_maps,
2087                                               exception_offset);
2088     if (nm == nullptr) return nm;
2089     if (method->is_continuation_enter_intrinsic()) {
2090       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2091     } else if (method->is_continuation_yield_intrinsic()) {
2092       _cont_doYield_stub = nm;
2093     }
2094     return nm;
2095   }
2096 
2097   if (method->is_method_handle_intrinsic()) {
2098     vmIntrinsics::ID iid = method->intrinsic_id();
2099     intptr_t start = (intptr_t)__ pc();
2100     int vep_offset = ((intptr_t)__ pc()) - start;
2101     gen_special_dispatch(masm,
2102                          method,
2103                          in_sig_bt,
2104                          in_regs);
2105     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2106     __ flush();
2107     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2108     return nmethod::new_native_nmethod(method,
2109                                        compile_id,
2110                                        masm->code(),
2111                                        vep_offset,
2112                                        frame_complete,
2113                                        stack_slots / VMRegImpl::slots_per_word,
2114                                        in_ByteSize(-1),
2115                                        in_ByteSize(-1),
2116                                        nullptr);
2117   }
2118   address native_func = method->native_function();
2119   assert(native_func != nullptr, "must have function");
2120 
2121   // An OopMap for lock (and class if static)
2122   OopMapSet *oop_maps = new OopMapSet();
2123   intptr_t start = (intptr_t)__ pc();
2124 
2125   // We have received a description of where all the java args are located
2126   // on entry to the wrapper. We need to convert these args to where
2127   // the jni function will expect them. To figure out where they go
2128   // we convert the java signature to a C signature by inserting
2129   // the hidden arguments as arg[0] and possibly arg[1] (static method)
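       // Illustrative example: for a static native int foo(Object o, long l), the Java signature
       // is { T_OBJECT, T_LONG, T_VOID } and the C signature built below becomes
       // { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_OBJECT, T_LONG, T_VOID }.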
2130 
2131   const int total_in_args = method->size_of_parameters();
2132   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2133 
2134   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2135   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2136 
2137   int argc = 0;
2138   out_sig_bt[argc++] = T_ADDRESS;
2139   if (method->is_static()) {
2140     out_sig_bt[argc++] = T_OBJECT;
2141   }
2142 
2143   for (int i = 0; i < total_in_args ; i++ ) {
2144     out_sig_bt[argc++] = in_sig_bt[i];
2145   }
2146 
2147   // Now figure out where the args must be stored and how much stack space
2148   // they require.
2149   int out_arg_slots;
2150   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2151 
2152   // Compute framesize for the wrapper.  We need to handlize all oops in
2153   // incoming registers
2154 
2155   // Calculate the total number of stack slots we will need.
2156 
2157   // First count the abi requirement plus all of the outgoing args
2158   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2159 
2160   // Now the space for the inbound oop handle area
2161   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2162 
2163   int oop_handle_offset = stack_slots;
2164   stack_slots += total_save_slots;
2165 
2166   // Now any space we need for handlizing a klass if this is a static method
2167 
2168   int klass_slot_offset = 0;
2169   int klass_offset = -1;
2170   int lock_slot_offset = 0;
2171   bool is_static = false;
2172 
2173   if (method->is_static()) {
2174     klass_slot_offset = stack_slots;
2175     stack_slots += VMRegImpl::slots_per_word;
2176     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2177     is_static = true;
2178   }
2179 
2180   // Plus a lock if needed
2181 
2182   if (method->is_synchronized()) {
2183     lock_slot_offset = stack_slots;
2184     stack_slots += VMRegImpl::slots_per_word;
2185   }
2186 
2187   // Now a place (+2) to save return values or temps during shuffling,
2188   // + 4 for the return address (which we own) and saved rbp
2189   stack_slots += 6;
2190 
2191   // Ok The space we have allocated will look like:
2192   //
2193   //
2194   // FP-> |                     |
2195   //      |---------------------|
2196   //      | 2 slots for moves   |
2197   //      |---------------------|
2198   //      | lock box (if sync)  |
2199   //      |---------------------| <- lock_slot_offset
2200   //      | klass (if static)   |
2201   //      |---------------------| <- klass_slot_offset
2202   //      | oopHandle area      |
2203   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2204   //      | outbound memory     |
2205   //      | based arguments     |
2206   //      |                     |
2207   //      |---------------------|
2208   //      |                     |
2209   // SP-> | out_preserved_slots |
2210   //
2211   //
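       // Worked example (illustrative, assuming no stack-passed outgoing args and no
       // out_preserve slots): a non-static, non-synchronized method needs
       // 0 (outgoing) + 12 (oop handle area) + 6 (moves/return address/rbp) = 18 slots,
       // which the align_up below rounds to 20 slots (80 bytes) when StackAlignmentInBytes is 16.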
2212 
2213 
2214   // Now compute the actual number of stack words we need, rounding up to keep
2215   // the stack properly aligned.
2216   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2217 
2218   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2219 
2220   // First thing make an ic check to see if we should even be here
2221 
2222   // We are free to use all registers as temps without saving them and
2223   // restoring them except rbp. rbp is the only callee save register
2224   // as far as the interpreter and the compiler(s) are concerned.
2225 
2226   const Register receiver = j_rarg0;
2227 
2228   Label exception_pending;
2229 
2230   assert_different_registers(receiver, rscratch1, rscratch2);
2231   __ verify_oop(receiver);
2232   __ ic_check(8 /* end_alignment */);
2233 
2234   int vep_offset = ((intptr_t)__ pc()) - start;
2235 
2236   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2237     Label L_skip_barrier;
2238     Register klass = r10;
2239     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2240     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2241 
2242     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2243 
2244     __ bind(L_skip_barrier);
2245   }
2246 
2247 #ifdef COMPILER1
2248   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2249   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2250     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2251   }
2252 #endif // COMPILER1
2253 
2254   // The instruction at the verified entry point must be 5 bytes or longer
2255   // because it can be patched on the fly by make_non_entrant. The stack bang
2256   // instruction fits that requirement.
2257 
2258   // Generate stack overflow check
2259   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2260 
2261   // Generate a new frame for the wrapper.
2262   __ enter();
2263   // -2 because return address is already present and so is saved rbp
2264   __ subptr(rsp, stack_size - 2*wordSize);
2265 
2266   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2267   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2268   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2269 
2270   // Frame is now completed as far as size and linkage.
2271   int frame_complete = ((intptr_t)__ pc()) - start;
2272 
2273 #ifdef ASSERT
2274   __ check_stack_alignment(rsp, "improperly aligned stack");
2275 #endif /* ASSERT */
2276 
2277 
2278   // We use r14 as the oop handle for the receiver/klass
2279   // It is callee save so it survives the call to native
2280 
2281   const Register oop_handle_reg = r14;
2282 
2283   //
2284   // We immediately shuffle the arguments so that for any vm call we have to
2285   // make from here on out (sync slow path, jvmti, etc.) we will have
2286   // captured the oops from our caller and have a valid oopMap for
2287   // them.
2288 
2289   // -----------------
2290   // The Grand Shuffle
2291 
2292   // The Java calling convention is either equal to (linux) or denser than (win64) the
2293   // C calling convention. However, because of the jni_env argument the C calling
2294   // convention always has at least one more (and two for static) arguments than Java.
2295   // Therefore, if we move the args from java -> c backwards then we will never have
2296   // a register->register conflict and we don't have to build a dependency graph
2297   // and figure out how to break any cycles.
2298   //
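       // Illustrative example: with a single hidden argument (JNIEnv*), java arg i moves to
       // c arg i + 1; walking the pairs from the last argument backwards (see arg_order below)
       // means a destination register is never clobbered before the argument still living in it
       // has been moved.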
2299 
2300   // Record esp-based slot for receiver on stack for non-static methods
2301   int receiver_offset = -1;
2302 
2303   // This is a trick. We double the stack slots so we can claim
2304   // the oops in the caller's frame. Since we are sure to have
2305   // more args than the caller, doubling is enough to make
2306   // sure we can capture all the incoming oop args from the
2307   // caller.
2308   //
2309   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2310 
2311   // Mark location of rbp (someday)
2312   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2313 
2314   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2315   // All inbound args are referenced based on rbp and all outbound args via rsp.
2316 
2317 
2318 #ifdef ASSERT
2319   bool reg_destroyed[Register::number_of_registers];
2320   bool freg_destroyed[XMMRegister::number_of_registers];
2321   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2322     reg_destroyed[r] = false;
2323   }
2324   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2325     freg_destroyed[f] = false;
2326   }
2327 
2328 #endif /* ASSERT */
2329 
2330   // For JNI natives the incoming and outgoing registers are offset upwards.
2331   GrowableArray<int> arg_order(2 * total_in_args);
2332 
2333   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2334     arg_order.push(i);
2335     arg_order.push(c_arg);
2336   }
2337 
2338   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2339     int i = arg_order.at(ai);
2340     int c_arg = arg_order.at(ai + 1);
2341     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2342 #ifdef ASSERT
2343     if (in_regs[i].first()->is_Register()) {
2344       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2345     } else if (in_regs[i].first()->is_XMMRegister()) {
2346       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2347     }
2348     if (out_regs[c_arg].first()->is_Register()) {
2349       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2350     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2351       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2352     }
2353 #endif /* ASSERT */
2354     switch (in_sig_bt[i]) {
2355       case T_ARRAY:
2356       case T_OBJECT:
2357         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2358                     ((i == 0) && (!is_static)),
2359                     &receiver_offset);
2360         break;
2361       case T_VOID:
2362         break;
2363 
2364       case T_FLOAT:
2365         __ float_move(in_regs[i], out_regs[c_arg]);
2366         break;
2367 
2368       case T_DOUBLE:
2369         assert( i + 1 < total_in_args &&
2370                 in_sig_bt[i + 1] == T_VOID &&
2371                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2372         __ double_move(in_regs[i], out_regs[c_arg]);
2373         break;
2374 
2375       case T_LONG :
2376         __ long_move(in_regs[i], out_regs[c_arg]);
2377         break;
2378 
2379       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2380 
2381       default:
2382         __ move32_64(in_regs[i], out_regs[c_arg]);
2383     }
2384   }
2385 
2386   int c_arg;
2387 
2388   // Pre-load a static method's oop into r14.  Used both by locking code and
2389   // the normal JNI call code.
2390   // point c_arg at the first arg that is already loaded in case we
2391   // need to spill before we call out
2392   c_arg = total_c_args - total_in_args;
2393 
2394   if (method->is_static()) {
2395 
2396     //  load oop into a register
2397     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2398 
2399     // Now handlize the static class mirror; it's known to be not-null.
2400     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2401     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2402 
2403     // Now get the handle
2404     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2405     // store the klass handle as second argument
2406     __ movptr(c_rarg1, oop_handle_reg);
2407     // and protect the arg if we must spill
2408     c_arg--;
2409   }
2410 
2411   // Change state to native (we save the return address in the thread, since it might not
2412   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2413   // points into the right code segment. It does not have to be the correct return pc.
2414   // We use the same pc/oopMap repeatedly when we call out
2415 
2416   Label native_return;
2417   if (method->is_object_wait0()) {
2418     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2419     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2420   } else {
2421     intptr_t the_pc = (intptr_t) __ pc();
2422     oop_maps->add_gc_map(the_pc - start, map);
2423 
2424     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2425   }
2426 
2427   // We have all of the arguments set up at this point. We must not touch any register
2428   // argument registers from here on (if we had to save/restore them, no oopMap would cover them).
2429 
2430   if (DTraceMethodProbes) {
2431     // protect the args we've loaded
2432     save_args(masm, total_c_args, c_arg, out_regs);
2433     __ mov_metadata(c_rarg1, method());
2434     __ call_VM_leaf(
2435       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2436       r15_thread, c_rarg1);
2437     restore_args(masm, total_c_args, c_arg, out_regs);
2438   }
2439 
2440   // RedefineClasses() tracing support for obsolete method entry
2441   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2442     // protect the args we've loaded
2443     save_args(masm, total_c_args, c_arg, out_regs);
2444     __ mov_metadata(c_rarg1, method());
2445     __ call_VM_leaf(
2446       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2447       r15_thread, c_rarg1);
2448     restore_args(masm, total_c_args, c_arg, out_regs);
2449   }
2450 
2451   // Lock a synchronized method
2452 
2453   // Register definitions used by locking and unlocking
2454 
2455   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2456   const Register obj_reg  = rbx;  // Will contain the oop
2457   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2458 
2459   Label slow_path_lock;
2460   Label lock_done;
2461 
2462   if (method->is_synchronized()) {
2463     // Get the handle (the 2nd argument)
2464     __ mov(oop_handle_reg, c_rarg1);
2465 
2466     // Get address of the box
2467 
2468     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2469 
2470     // Load the oop from the handle
2471     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2472 
2473     __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2474 
2475     // Slow path will re-enter here
2476     __ bind(lock_done);
2477   }
2478 
2479   // Finally just about ready to make the JNI call
2480 
2481   // get JNIEnv* which is first argument to native
2482   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2483 
2484   // Now set thread in native
2485   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2486 
2487   __ call(RuntimeAddress(native_func));
2488 
2489   // Verify or restore cpu control state after JNI call
2490   __ restore_cpu_control_state_after_jni(rscratch1);
2491 
2492   // Unpack native results.
2493   switch (ret_type) {
2494   case T_BOOLEAN: __ c2bool(rax);            break;
2495   case T_CHAR   : __ movzwl(rax, rax);      break;
2496   case T_BYTE   : __ sign_extend_byte (rax); break;
2497   case T_SHORT  : __ sign_extend_short(rax); break;
2498   case T_INT    : /* nothing to do */        break;
2499   case T_DOUBLE :
2500   case T_FLOAT  :
2501     // Result is in xmm0 we'll save as needed
2502     break;
2503   case T_ARRAY:                 // Really a handle
2504   case T_OBJECT:                // Really a handle
2505       break; // can't de-handlize until after safepoint check
2506   case T_VOID: break;
2507   case T_LONG: break;
2508   default       : ShouldNotReachHere();
2509   }
2510 
2511   // Switch thread to "native transition" state before reading the synchronization state.
2512   // This additional state is necessary because reading and testing the synchronization
2513   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2514   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2515   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2516   //     Thread A is resumed to finish this native method, but doesn't block here since it
2517   //     didn't see any synchronization in progress, and escapes.
2518   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2519 
2520   // Force this write out before the read below
2521   if (!UseSystemMemoryBarrier) {
2522     __ membar(Assembler::Membar_mask_bits(
2523               Assembler::LoadLoad | Assembler::LoadStore |
2524               Assembler::StoreLoad | Assembler::StoreStore));
2525   }
2526 
2527   // check for safepoint operation in progress and/or pending suspend requests
2528   {
2529     Label Continue;
2530     Label slow_path;
2531 
2532     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2533 
2534     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2535     __ jcc(Assembler::equal, Continue);
2536     __ bind(slow_path);
2537 
2538     // Don't use call_VM as it will see a possible pending exception and forward it
2539     // and never return here, preventing us from clearing _last_native_pc down below.
2540     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2541     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2542     // by hand.
2543     //
2544     __ vzeroupper();
2545     save_native_result(masm, ret_type, stack_slots);
2546     __ mov(c_rarg0, r15_thread);
2547     __ mov(r12, rsp); // remember sp
2548     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2549     __ andptr(rsp, -16); // align stack as required by ABI
2550     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2551     __ mov(rsp, r12); // restore sp
2552     __ reinit_heapbase();
2553     // Restore any method result value
2554     restore_native_result(masm, ret_type, stack_slots);
2555     __ bind(Continue);
2556   }
2557 
2558   // change thread state
2559   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2560 
2561   if (method->is_object_wait0()) {
2562     // Check preemption for Object.wait()
2563     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2564     __ cmpptr(rscratch1, NULL_WORD);
2565     __ jccb(Assembler::equal, native_return);
2566     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2567     __ jmp(rscratch1);
2568     __ bind(native_return);
2569 
2570     intptr_t the_pc = (intptr_t) __ pc();
2571     oop_maps->add_gc_map(the_pc - start, map);
2572   }
2573 
2574 
2575   Label reguard;
2576   Label reguard_done;
2577   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2578   __ jcc(Assembler::equal, reguard);
2579   __ bind(reguard_done);
2580 
2581   // native result if any is live
2582 
2583   // Unlock
2584   Label slow_path_unlock;
2585   Label unlock_done;
2586   if (method->is_synchronized()) {
2587 
2588     Label fast_done;
2589 
2590     // Get locked oop from the handle we passed to jni
2591     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2592 
2593     // Must save rax if it is live now because cmpxchg must use it
2594     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2595       save_native_result(masm, ret_type, stack_slots);
2596     }
2597 
2598     __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2599 
2600     // slow path re-enters here
2601     __ bind(unlock_done);
2602     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2603       restore_native_result(masm, ret_type, stack_slots);
2604     }
2605 
2606     __ bind(fast_done);
2607   }
2608   if (DTraceMethodProbes) {
2609     save_native_result(masm, ret_type, stack_slots);
2610     __ mov_metadata(c_rarg1, method());
2611     __ call_VM_leaf(
2612          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2613          r15_thread, c_rarg1);
2614     restore_native_result(masm, ret_type, stack_slots);
2615   }
2616 
2617   __ reset_last_Java_frame(false);
2618 
2619   // Unbox oop result, e.g. JNIHandles::resolve value.
2620   if (is_reference_type(ret_type)) {
2621     __ resolve_jobject(rax /* value */,
2622                        rcx /* tmp */);
2623   }
2624 
2625   if (CheckJNICalls) {
2626     // clear_pending_jni_exception_check
2627     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2628   }
2629 
2630   // reset handle block
2631   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2632   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2633 
2634   // pop our frame
2635 
2636   __ leave();
2637 
2638 #if INCLUDE_JFR
2639   // We need to do a poll test after unwind in case the sampler
2640   // managed to sample the native frame after returning to Java.
2641   Label L_return;
2642   address poll_test_pc = __ pc();
2643   __ relocate(relocInfo::poll_return_type);
2644   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2645   __ jccb(Assembler::zero, L_return);
2646   __ lea(rscratch1, InternalAddress(poll_test_pc));
2647   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2648   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2649     "polling page return stub not created yet");
2650   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2651   __ jump(RuntimeAddress(stub));
2652   __ bind(L_return);
2653 #endif // INCLUDE_JFR
2654 
2655   // Any exception pending?
2656   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2657   __ jcc(Assembler::notEqual, exception_pending);
2658 
2659   // Return
2660 
2661   __ ret(0);
2662 
2663   // Unexpected paths are out of line and go here
2664 
2665   // forward the exception
2666   __ bind(exception_pending);
2667 
2668   // and forward the exception
2669   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2670 
2671   // Slow path locking & unlocking
2672   if (method->is_synchronized()) {
2673 
2674     // BEGIN Slow path lock
2675     __ bind(slow_path_lock);
2676 
2677     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2678     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2679 
2680     // protect the args we've loaded
2681     save_args(masm, total_c_args, c_arg, out_regs);
2682 
2683     __ mov(c_rarg0, obj_reg);
2684     __ mov(c_rarg1, lock_reg);
2685     __ mov(c_rarg2, r15_thread);
2686 
2687     // Not a leaf but we have last_Java_frame setup as we want.
2688     // We don't want to unmount in case of contention since that would complicate preserving
2689     // the arguments that had already been marshalled into the native convention. So we force
2690     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2691     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2692     __ push_cont_fastpath();
2693     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2694     __ pop_cont_fastpath();
2695     restore_args(masm, total_c_args, c_arg, out_regs);
2696 
2697 #ifdef ASSERT
2698     { Label L;
2699     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2700     __ jcc(Assembler::equal, L);
2701     __ stop("no pending exception allowed on exit from monitorenter");
2702     __ bind(L);
2703     }
2704 #endif
2705     __ jmp(lock_done);
2706 
2707     // END Slow path lock
2708 
2709     // BEGIN Slow path unlock
2710     __ bind(slow_path_unlock);
2711 
2712     // If we haven't already saved the native result we must save it now as xmm registers
2713     // are still exposed.
2714     __ vzeroupper();
2715     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2716       save_native_result(masm, ret_type, stack_slots);
2717     }
2718 
2719     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2720 
2721     __ mov(c_rarg0, obj_reg);
2722     __ mov(c_rarg2, r15_thread);
2723     __ mov(r12, rsp); // remember sp
2724     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2725     __ andptr(rsp, -16); // align stack as required by ABI
2726 
2727     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2728     // NOTE that obj_reg == rbx currently
2729     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2730     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2731 
2732     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2733     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2734     __ mov(rsp, r12); // restore sp
2735     __ reinit_heapbase();
2736 #ifdef ASSERT
2737     {
2738       Label L;
2739       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2740       __ jcc(Assembler::equal, L);
2741       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2742       __ bind(L);
2743     }
2744 #endif /* ASSERT */
2745 
2746     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2747 
2748     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2749       restore_native_result(masm, ret_type, stack_slots);
2750     }
2751     __ jmp(unlock_done);
2752 
2753     // END Slow path unlock
2754 
2755   } // synchronized
2756 
2757   // SLOW PATH Reguard the stack if needed
2758 
2759   __ bind(reguard);
2760   __ vzeroupper();
2761   save_native_result(masm, ret_type, stack_slots);
2762   __ mov(r12, rsp); // remember sp
2763   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2764   __ andptr(rsp, -16); // align stack as required by ABI
2765   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2766   __ mov(rsp, r12); // restore sp
2767   __ reinit_heapbase();
2768   restore_native_result(masm, ret_type, stack_slots);
2769   // and continue
2770   __ jmp(reguard_done);
2771 
2772 
2773 
2774   __ flush();
2775 
2776   nmethod *nm = nmethod::new_native_nmethod(method,
2777                                             compile_id,
2778                                             masm->code(),
2779                                             vep_offset,
2780                                             frame_complete,
2781                                             stack_slots / VMRegImpl::slots_per_word,
2782                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2783                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2784                                             oop_maps);
2785 
2786   return nm;
2787 }
2788 
2789 // This function returns the adjusted size (in number of words) of a c2i adapter
2790 // activation for use during deoptimization.
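     // For example (illustrative): with callee_locals == 5, callee_parameters == 2 and
     // Interpreter::stackElementWords == 1 (as on 64-bit), the caller's frame is extended
     // by (5 - 2) * 1 == 3 words to make room for the extra locals.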
2791 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2792   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2793 }
2794 
2795 
2796 uint SharedRuntime::out_preserve_stack_slots() {
2797   return 0;
2798 }
2799 
2800 
2801 // Number of stack slots between incoming argument block and the start of
2802 // a new frame.  The PROLOG must add this many slots to the stack.  The
2803 // EPILOG must remove this many slots.  amd64 needs two slots for
2804 // return address.
2805 uint SharedRuntime::in_preserve_stack_slots() {
2806   return 4 + 2 * VerifyStackAtCalls;
2807 }
2808 
2809 VMReg SharedRuntime::thread_register() {
2810   return r15_thread->as_VMReg();
2811 }
2812 
2813 //------------------------------generate_deopt_blob----------------------------
2814 void SharedRuntime::generate_deopt_blob() {
2815   // Allocate space for the code
2816   ResourceMark rm;
2817   // Set up code generation tools
2818   int pad = 0;
2819   if (UseAVX > 2) {
2820     pad += 1024;
2821   }
2822   if (UseAPX) {
2823     pad += 1024;
2824   }
2825 #if INCLUDE_JVMCI
2826   if (EnableJVMCI) {
2827     pad += 512; // Increase the buffer size when compiling for JVMCI
2828   }
2829 #endif
2830   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2831   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2832   if (blob != nullptr) {
2833     _deopt_blob = blob->as_deoptimization_blob();
2834     return;
2835   }
2836 
2837   CodeBuffer buffer(name, 2560+pad, 1024);
2838   MacroAssembler* masm = new MacroAssembler(&buffer);
2839   int frame_size_in_words;
2840   OopMap* map = nullptr;
2841   OopMapSet *oop_maps = new OopMapSet();
2842 
2843   // -------------
2844   // This code enters when returning to a de-optimized nmethod.  A return
2845   // address has been pushed on the stack, and return values are in
2846   // registers.
2847   // If we are doing a normal deopt then we were called from the patched
2848   // nmethod from the point we returned to the nmethod. So the return
2849   // address on the stack is wrong by NativeCall::instruction_size.
2850   // We will adjust the value so it looks like we have the original return
2851   // address on the stack (like when we eagerly deoptimized).
2852   // In the case of an exception pending when deoptimizing, we enter
2853   // with a return address on the stack that points after the call we patched
2854   // into the exception handler. We have the following register state from,
2855   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2856   //    rax: exception oop
2857   //    rbx: exception handler
2858   //    rdx: throwing pc
2859   // So in this case we simply jam rdx into the useless return address and
2860   // the stack looks just like we want.
2861   //
2862   // At this point we need to de-opt.  We save the argument return
2863   // registers.  We call the first C routine, fetch_unroll_info().  This
2864   // routine captures the return values and returns a structure which
2865   // describes the current frame size and the sizes of all replacement frames.
2866   // The current frame is compiled code and may contain many inlined
2867   // functions, each with their own JVM state.  We pop the current frame, then
2868   // push all the new frames.  Then we call the C routine unpack_frames() to
2869   // populate these frames.  Finally unpack_frames() returns us the new target
2870   // address.  Notice that callee-save registers are BLOWN here; they have
2871   // already been captured in the vframeArray at the time the return PC was
2872   // patched.
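       //
       // Roughly, the stack is transformed as follows (illustrative sketch):
       //
       //   before unpack:                      after unpack:
       //     [caller of deoptee]                 [caller of deoptee]
       //     [deoptee frame (compiled,           [interpreter frame 1]
       //      possibly with inlining)]           ...
       //     [this blob's frame]                 [interpreter frame N]
       //                                         [this blob's frame]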
2873   address start = __ pc();
2874   Label cont;
2875 
2876   // Prolog for non exception case!
2877 
2878   // Save everything in sight.
2879   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2880 
2881   // Normal deoptimization.  Save exec mode for unpack_frames.
2882   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2883   __ jmp(cont);
2884 
2885   int reexecute_offset = __ pc() - start;
2886 #if INCLUDE_JVMCI && !defined(COMPILER1)
2887   if (UseJVMCICompiler) {
2888     // JVMCI does not use this kind of deoptimization
2889     __ should_not_reach_here();
2890   }
2891 #endif
2892 
2893   // Reexecute case
2894   // The return address is the pc that describes which bci to re-execute at.
2895 
2896   // No need to update the map, as each call to save_live_registers produces an identical oopmap
2897   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2898 
2899   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2900   __ jmp(cont);
2901 
2902 #if INCLUDE_JVMCI
2903   Label after_fetch_unroll_info_call;
2904   int implicit_exception_uncommon_trap_offset = 0;
2905   int uncommon_trap_offset = 0;
2906 
2907   if (EnableJVMCI) {
2908     implicit_exception_uncommon_trap_offset = __ pc() - start;
2909 
2910     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2911     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2912 
2913     uncommon_trap_offset = __ pc() - start;
2914 
2915     // Save everything in sight.
2916     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2917     // fetch_unroll_info needs to call last_java_frame()
2918     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2919 
2920     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2921     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2922 
2923     __ movl(r14, Deoptimization::Unpack_reexecute);
2924     __ mov(c_rarg0, r15_thread);
2925     __ movl(c_rarg2, r14); // exec mode
2926     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2927     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2928 
2929     __ reset_last_Java_frame(false);
2930 
2931     __ jmp(after_fetch_unroll_info_call);
2932   } // EnableJVMCI
2933 #endif // INCLUDE_JVMCI
2934 
2935   int exception_offset = __ pc() - start;
2936 
2937   // Prolog for exception case
2938 
2939   // all registers are dead at this entry point, except for rax, and
2940   // rdx which contain the exception oop and exception pc
2941   // respectively.  Set them in TLS and fall thru to the
2942   // unpack_with_exception_in_tls entry point.
2943 
2944   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2945   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2946 
2947   int exception_in_tls_offset = __ pc() - start;
2948 
2949   // new implementation because exception oop is now passed in JavaThread
2950 
2951   // Prolog for exception case
2952   // All registers must be preserved because they might be used by LinearScan
2953   // Exception oop and throwing PC are passed in JavaThread
2954   // tos: stack at point of call to method that threw the exception (i.e. only
2955   // args are on the stack, no return address)
2956 
2957   // make room on stack for the return address
2958   // It will be patched later with the throwing pc. The correct value is not
2959   // available now because loading it from memory would destroy registers.
2960   __ push(0);
2961 
2962   // Save everything in sight.
2963   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2964 
2965   // Now it is safe to overwrite any register
2966 
2967   // Deopt during an exception.  Save exec mode for unpack_frames.
2968   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2969 
2970   // load throwing pc from JavaThread and patch it as the return address
2971   // of the current frame. Then clear the field in JavaThread
2972 
2973   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2974   __ movptr(Address(rbp, wordSize), rdx);
2975   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2976 
2977 #ifdef ASSERT
2978   // verify that there is really an exception oop in JavaThread
2979   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2980   __ verify_oop(rax);
2981 
2982   // verify that there is no pending exception
2983   Label no_pending_exception;
2984   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2985   __ testptr(rax, rax);
2986   __ jcc(Assembler::zero, no_pending_exception);
2987   __ stop("must not have pending exception here");
2988   __ bind(no_pending_exception);
2989 #endif
2990 
2991   __ bind(cont);
2992 
2993   // Call C code.  Need thread and this frame, but NOT official VM entry
2994   // crud.  We cannot block on this call, no GC can happen.
2995   //
2996   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2997 
2998   // fetch_unroll_info needs to call last_java_frame().
2999 
3000   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3001 #ifdef ASSERT
3002   { Label L;
3003     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3004     __ jcc(Assembler::equal, L);
3005     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3006     __ bind(L);
3007   }
3008 #endif // ASSERT
3009   __ mov(c_rarg0, r15_thread);
3010   __ movl(c_rarg1, r14); // exec_mode
3011   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3012 
3013   // Need to have an oopmap that tells fetch_unroll_info where to
3014   // find any register it might need.
3015   oop_maps->add_gc_map(__ pc() - start, map);
3016 
3017   __ reset_last_Java_frame(false);
3018 
3019 #if INCLUDE_JVMCI
3020   if (EnableJVMCI) {
3021     __ bind(after_fetch_unroll_info_call);
3022   }
3023 #endif
3024 
3025   // Load UnrollBlock* into rdi
3026   __ mov(rdi, rax);
3027 
3028   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3029   Label noException;
3030   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3031   __ jcc(Assembler::notEqual, noException);
3032   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3033   // QQQ this is useless, it was null above
3034   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3035   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3036   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3037 
3038   __ verify_oop(rax);
3039 
3040   // Overwrite the result registers with the exception results.
3041   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3042   // I think this is useless
3043   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3044 
3045   __ bind(noException);
3046 
3047   // Only register save data is on the stack.
3048   // Now restore the result registers.  Everything else is either dead
3049   // or captured in the vframeArray.
3050   RegisterSaver::restore_result_registers(masm);
3051 
3052   // All of the register save area has been popped off the stack. Only the
3053   // return address remains.
3054 
3055   // Pop all the frames we must move/replace.
3056   //
3057   // Frame picture (youngest to oldest)
3058   // 1: self-frame (no frame link)
3059   // 2: deopting frame  (no frame link)
3060   // 3: caller of deopting frame (could be compiled/interpreted).
3061   //
3062   // Note: by leaving the return address of self-frame on the stack
3063   // and using the size of frame 2 to adjust the stack
3064   // when we are done the return to frame 3 will still be on the stack.
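       //
       // For example (illustrative): if size_of_deoptimized_frame is S bytes, then after the
       // addptr(rsp, rcx) below adds S to rsp, rsp points directly at the return address
       // into frame 3.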
3065 
3066   // Pop deoptimized frame
3067   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3068   __ addptr(rsp, rcx);
3069 
3070   // rsp should be pointing at the return address to the caller (3)
3071 
3072   // Pick up the initial fp we should save
3073   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3074   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3075 
3076 #ifdef ASSERT
3077   // Compilers generate code that bang the stack by as much as the
3078   // interpreter would need. So this stack banging should never
3079   // trigger a fault. Verify that it does not on non product builds.
3080   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3081   __ bang_stack_size(rbx, rcx);
3082 #endif
3083 
3084   // Load address of array of frame pcs into rcx
3085   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3086 
3087   // Trash the old pc
3088   __ addptr(rsp, wordSize);
3089 
3090   // Load address of array of frame sizes into rsi
3091   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3092 
3093   // Load counter into rdx
3094   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3095 
3096   // Now adjust the caller's stack to make up for the extra locals, but record
3097   // the original sp first so that we can save it in the skeletal interpreter
3098   // frame; the stack walking of interpreter_sender will then get the unextended
3099   // sp value and not the "real" sp value.
3100 
3101   const Register sender_sp = r8;
3102 
3103   __ mov(sender_sp, rsp);
3104   __ movl(rbx, Address(rdi,
3105                        Deoptimization::UnrollBlock::
3106                        caller_adjustment_offset()));
3107   __ subptr(rsp, rbx);
3108 
3109   // Push interpreter frames in a loop
3110   Label loop;
3111   __ bind(loop);
3112   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3113   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3114   __ pushptr(Address(rcx, 0));          // Save return address
3115   __ enter();                           // Save old & set new ebp
3116   __ subptr(rsp, rbx);                  // Prolog
3117   // This value is corrected by layout_activation_impl
3118   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3119   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3120   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3121   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3122   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3123   __ decrementl(rdx);                   // Decrement counter
3124   __ jcc(Assembler::notZero, loop);
3125   __ pushptr(Address(rcx, 0));          // Save final return address
3126 
3127   // Re-push self-frame
3128   __ enter();                           // Save old & set new ebp
3129 
3130   // Allocate a full sized register save area.
3131   // Return address and rbp are in place, so we allocate two less words.
3132   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3133 
3134   // Restore frame locals after moving the frame
3135   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3136   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3137 
3138   // Call C code.  Need thread but NOT official VM entry
3139   // crud.  We cannot block on this call, no GC can happen.  Call should
3140   // restore return values to their stack-slots with the new SP.
3141   //
3142   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3143 
3144   // Use rbp because the frames look interpreted now
3145   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3146   // Don't need the precise return PC here, just precise enough to point into this code blob.
3147   address the_pc = __ pc();
3148   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3149 
3150   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3151   __ mov(c_rarg0, r15_thread);
3152   __ movl(c_rarg1, r14); // second arg: exec_mode
3153   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3154   // Revert SP alignment after call since we're going to do some SP relative addressing below
3155   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3156 
3157   // Set an oopmap for the call site
3158   // Use the same PC we used for the last java frame
3159   oop_maps->add_gc_map(the_pc - start,
3160                        new OopMap( frame_size_in_words, 0 ));
3161 
3162   // Clear fp AND pc
3163   __ reset_last_Java_frame(true);
3164 
3165   // Collect return values
3166   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3167   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3168   // I think this is useless (throwing pc?)
3169   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3170 
3171   // Pop self-frame.
3172   __ leave();                           // Epilog
3173 
3174   // Jump to interpreter
3175   __ ret(0);
3176 
3177   // Make sure all code is generated
3178   masm->flush();
3179 
3180   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3181   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3182 #if INCLUDE_JVMCI
3183   if (EnableJVMCI) {
3184     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3185     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3186   }
3187 #endif
3188 
3189   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3190 }
3191 
3192 //------------------------------generate_handler_blob------
3193 //
3194 // Generate a special Compile2Runtime blob that saves all registers,
3195 // and sets up the oopmap.
3196 //
3197 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
3198   assert(StubRoutines::forward_exception_entry() != nullptr,
3199          "must be generated before");
3200   assert(is_polling_page_id(id), "expected a polling page stub id");
3201 
3202   // Allocate space for the code.  Set up code generation tools.
3203   const char* name = SharedRuntime::stub_name(id);
3204   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3205   if (blob != nullptr) {
3206     return blob->as_safepoint_blob();
3207   }
3208 
3209   ResourceMark rm;
3210   OopMapSet *oop_maps = new OopMapSet();
3211   OopMap* map;
3212   CodeBuffer buffer(name, 2548, 1024);
3213   MacroAssembler* masm = new MacroAssembler(&buffer);
3214 
3215   address start   = __ pc();
3216   address call_pc = nullptr;
3217   int frame_size_in_words;
3218   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
3219   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
3220 
3221   // Make room for return address (or push it again)
3222   if (!cause_return) {
3223     __ push(rbx);
3224   }
3225 
3226   // Save registers, fpu state, and flags
3227   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3228 
3229   // The following is basically a call_VM.  However, we need the precise
3230   // address of the call in order to generate an oopmap. Hence, we do all the
3231   // work ourselves.
3232 
3233   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3234 
3235   // The return address must always be correct so that the frame constructor
3236   // never sees an invalid pc.
3237 
3238   if (!cause_return) {
3239     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3240     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
3241     // if someone changed the return address for us!
3242     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3243     __ movptr(Address(rbp, wordSize), rbx);
3244   }
3245 
3246   // Do the call
3247   __ mov(c_rarg0, r15_thread);
3248   __ call(RuntimeAddress(call_ptr));
3249 
3250   // Set an oopmap for the call site.  This oopmap will map all
3251   // oop-registers and debug-info registers as callee-saved.  This
3252   // will allow deoptimization at this safepoint to find all possible
3253   // debug-info recordings, as well as let GC find all oops.
3254 
3255   oop_maps->add_gc_map( __ pc() - start, map);
3256 
3257   Label noException;
3258 
3259   __ reset_last_Java_frame(false);
3260 
3261   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3262   __ jcc(Assembler::equal, noException);
3263 
3264   // Exception pending
3265 
3266   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3267 
3268   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3269 
3270   // No exception case
3271   __ bind(noException);
3272 
3273   Label no_adjust;
3274 #ifdef ASSERT
3275   Label bail;
3276 #endif
3277   if (!cause_return) {
3278     Label no_prefix, not_special, check_rex_prefix;
3279 
3280     // If our stashed return pc was modified by the runtime we avoid touching it
3281     __ cmpptr(rbx, Address(rbp, wordSize));
3282     __ jcc(Assembler::notEqual, no_adjust);
3283 
3284     // Skip over the poll instruction.
3285     // See NativeInstruction::is_safepoint_poll()
3286     // Possible encodings:
3287     //      85 00       test   %eax,(%rax)
3288     //      85 01       test   %eax,(%rcx)
3289     //      85 02       test   %eax,(%rdx)
3290     //      85 03       test   %eax,(%rbx)
3291     //      85 06       test   %eax,(%rsi)
3292     //      85 07       test   %eax,(%rdi)
3293     //
3294     //   41 85 00       test   %eax,(%r8)
3295     //   41 85 01       test   %eax,(%r9)
3296     //   41 85 02       test   %eax,(%r10)
3297     //   41 85 03       test   %eax,(%r11)
3298     //   41 85 06       test   %eax,(%r14)
3299     //   41 85 07       test   %eax,(%r15)
3300     //
3301     //      85 04 24    test   %eax,(%rsp)
3302     //   41 85 04 24    test   %eax,(%r12)
3303     //      85 45 00    test   %eax,0x0(%rbp)
3304     //   41 85 45 00    test   %eax,0x0(%r13)
3305     //
3306     // Notes:
3307     //  Format of the legacy MAP0 test instruction:
3308     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3309     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3310     //     register operand and of the base register of the memory operand are within [0, 8),
3311     //     so no additional REX prefix (whose REX.B bit holds the MSB of the register encoding)
3312     //     is required, which is why a two-byte encoding is sufficient here.
3313     //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3314     //     register of the memory operand is 1000, so we need an additional REX prefix in this
3315     //     case, thereby adding an extra byte to the instruction encoding.
3316     //  o  If the BASE register is one of the 32 extended GPRs available only on targets that
3317     //     support the Intel APX extension, we need to emit a two-byte REX2 prefix to hold the
3318     //     most significant two bits of the 5-bit register encoding.
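         //
         // For example (illustrative): for "85 00" (test %eax,(%rax)) the code below simply adds 2
         // to the saved pc; for "41 85 00" (test %eax,(%r8)) it first steps over the REX prefix
         // byte and then adds 2; for "41 85 04 24" (test %eax,(%r12)) it steps over the REX byte,
         // then over the extra byte required by an rsp/r12 (SIB) or rbp/r13 (disp8) base, and then
         // adds 2.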
3319 
3320     if (VM_Version::supports_apx_f()) {
3321       __ cmpb(Address(rbx, 0), Assembler::REX2);
3322       __ jccb(Assembler::notEqual, check_rex_prefix);
3323       __ addptr(rbx, 2);
3324       __ bind(check_rex_prefix);
3325     }
3326     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3327     __ jccb(Assembler::notEqual, no_prefix);
3328     __ addptr(rbx, 1);
3329     __ bind(no_prefix);
3330 #ifdef ASSERT
3331     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3332 #endif
3333     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3334     // r12/rsp 0x04
3335     // r13/rbp 0x05
3336     __ movzbq(rcx, Address(rbx, 1));
3337     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3338     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3339     __ cmpptr(rcx, 1);
3340     __ jccb(Assembler::above, not_special);
3341     __ addptr(rbx, 1);
3342     __ bind(not_special);
3343 #ifdef ASSERT
3344     // Verify the correct encoding of the poll we're about to skip.
3345     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3346     __ jcc(Assembler::notEqual, bail);
3347     // Mask out the modrm bits
3348     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3349     // rax encodes to 0, so if the bits are nonzero it's incorrect
3350     __ jcc(Assembler::notZero, bail);
3351 #endif
3352     // Adjust return pc forward to step over the safepoint poll instruction
3353     __ addptr(rbx, 2);
3354     __ movptr(Address(rbp, wordSize), rbx);
3355   }
3356 
3357   __ bind(no_adjust);
3358   // Normal exit, restore registers and exit.
3359   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3360   __ ret(0);
3361 
3362 #ifdef ASSERT
3363   __ bind(bail);
3364   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3365 #endif
3366 
3367   // Make sure all code is generated
3368   masm->flush();
3369 
3370   // Fill-out other meta info
3371   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3372 
3373   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3374   return sp_blob;
3375 }
3376 
3377 //
3378 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3379 //
3380 // Generate a stub that calls into vm to find out the proper destination
3381 // of a java call. All the argument registers are live at this point
3382 // but since this is generic code we don't know what they are and the caller
3383 // must do any gc of the args.
3384 //
3385 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3386   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3387   assert(is_resolve_id(id), "expected a resolve stub id");
3388 
3389   const char* name = SharedRuntime::stub_name(id);
3390   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3391   if (blob != nullptr) {
3392     return blob->as_runtime_stub();
3393   }
3394 
3395   // allocate space for the code
3396   ResourceMark rm;
3397   CodeBuffer buffer(name, 1552, 512);
3398   MacroAssembler* masm = new MacroAssembler(&buffer);
3399 
3400   int frame_size_in_words;
3401 
3402   OopMapSet *oop_maps = new OopMapSet();
3403   OopMap* map = nullptr;
3404 
3405   int start = __ offset();
3406 
3407   // No need to save vector registers since they are caller-saved anyway.
3408   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3409 
3410   int frame_complete = __ offset();
3411 
3412   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3413 
3414   __ mov(c_rarg0, r15_thread);
3415 
3416   __ call(RuntimeAddress(destination));
3417 
3418 
3419   // Set an oopmap for the call site.
3420   // We need this not only for callee-saved registers, but also for volatile
3421   // registers that the compiler might be keeping live across a safepoint.
3422 
3423   oop_maps->add_gc_map( __ offset() - start, map);
3424 
3425   // rax contains the address we are going to jump to assuming no exception got installed
3426 
3427   // clear last_Java_sp
3428   __ reset_last_Java_frame(false);
3429   // check for pending exceptions
3430   Label pending;
3431   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3432   __ jcc(Assembler::notEqual, pending);
3433 
3434   // get the returned Method*
3435   __ get_vm_result_metadata(rbx);
3436   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3437 
3438   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3439 
3440   RegisterSaver::restore_live_registers(masm);
3441 
3442   // We are back to the original state on entry and ready to go.
3443 
3444   __ jmp(rax);
3445 
3446   // Pending exception after the safepoint
3447 
3448   __ bind(pending);
3449 
3450   RegisterSaver::restore_live_registers(masm);
3451 
3452   // exception pending => remove activation and forward to exception handler
3453 
3454   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3455 
3456   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3457   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3458 
3459   // -------------
3460   // make sure all code is generated
3461   masm->flush();
3462 
3463   // return the blob
3464   // (frame_size_in_words is in words, as new_runtime_stub expects)
3465   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3466 
3467   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3468   return rs_blob;
3469 }
3470 
3471 // Continuation point for throwing of implicit exceptions that are
3472 // not handled in the current activation. Fabricates an exception
3473 // oop and initiates normal exception dispatching in this
3474 // frame. Since we need to preserve callee-saved values (currently
3475 // only for C2, but done for C1 as well) we need a callee-saved oop
3476 // map and therefore have to make these stubs into RuntimeStubs
3477 // rather than BufferBlobs.  If the compiler needs all registers to
3478 // be preserved between the fault point and the exception handler
3479 // then it must assume responsibility for that in
3480 // AbstractCompiler::continuation_for_implicit_null_exception or
3481 // continuation_for_implicit_division_by_zero_exception. All other
3482 // implicit exceptions (e.g., NullPointerException or
3483 // AbstractMethodError on entry) are either at call sites or
3484 // otherwise assume that stack unwinding will be initiated, so
3485 // caller saved registers were assumed volatile in the compiler.
3486 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3487   assert(is_throw_id(id), "expected a throw stub id");
3488 
3489   const char* name = SharedRuntime::stub_name(id);
3490 
3491   // Information about frame layout at time of blocking runtime call.
3492   // Note that we only have to preserve callee-saved registers since
3493   // the compilers are responsible for supplying a continuation point
3494   // if they expect all registers to be preserved.
3495   enum layout {
3496     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3497     rbp_off2,
3498     return_off,
3499     return_off2,
3500     framesize // inclusive of return address
3501   };
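       // For example (assuming frame::arg_reg_save_area_bytes == 0, as on non-Windows x86_64),
       // the layout above is 4 slots (16 bytes): two for the saved rbp and two for the return
       // address, so framesize/2 is even and rsp stays 16-byte aligned, as asserted below.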
3502 
3503   int insts_size = 512;
3504   int locs_size  = 64;
3505 
3506   const char* timer_msg = "SharedRuntime generate_throw_exception";
3507   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3508 
3509   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3510   if (blob != nullptr) {
3511     return blob->as_runtime_stub();
3512   }
3513 
3514   ResourceMark rm;
3515   CodeBuffer code(name, insts_size, locs_size);
3516   OopMapSet* oop_maps  = new OopMapSet();
3517   MacroAssembler* masm = new MacroAssembler(&code);
3518 
3519   address start = __ pc();
3520 
3521   // This is an inlined and slightly modified version of call_VM
3522   // which has the ability to fetch the return PC out of
3523   // thread-local storage and also sets up last_Java_sp slightly
3524   // differently than the real call_VM
3525 
3526   __ enter(); // required for proper stackwalking of RuntimeStub frame
3527 
3528   assert(is_even(framesize/2), "sp not 16-byte aligned");
3529 
3530   // return address and rbp are already in place
3531   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3532 
3533   int frame_complete = __ pc() - start;
3534 
3535   // Set up last_Java_sp and last_Java_fp
3536   address the_pc = __ pc();
3537   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3538   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3539 
3540   // Call runtime
3541   __ movptr(c_rarg0, r15_thread);
3542   BLOCK_COMMENT("call runtime_entry");
3543   __ call(RuntimeAddress(runtime_entry));
3544 
3545   // Generate oop map
3546   OopMap* map = new OopMap(framesize, 0);
3547 
3548   oop_maps->add_gc_map(the_pc - start, map);
3549 
3550   __ reset_last_Java_frame(true);
3551 
3552   __ leave(); // required for proper stackwalking of RuntimeStub frame
3553 
3554   // check for pending exceptions
3555 #ifdef ASSERT
3556   Label L;
3557   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3558   __ jcc(Assembler::notEqual, L);
3559   __ should_not_reach_here();
3560   __ bind(L);
3561 #endif // ASSERT
3562   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3563 
3564 
3565   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3566   RuntimeStub* stub =
3567     RuntimeStub::new_runtime_stub(name,
3568                                   &code,
3569                                   frame_complete,
3570                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3571                                   oop_maps, false);
3572   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3573 
3574   return stub;
3575 }
3576 
3577 //------------------------------Montgomery multiplication------------------------
3578 //
3579 
3580 #ifndef _WINDOWS
3581 
3582 // Subtract 0:b from carry:a.  Return carry.
3583 static julong
3584 sub(julong a[], julong b[], julong carry, long len) {
3585   long long i = 0, cnt = len;
3586   julong tmp;
3587   asm volatile("clc; "
3588                "0: ; "
3589                "mov (%[b], %[i], 8), %[tmp]; "
3590                "sbb %[tmp], (%[a], %[i], 8); "
3591                "inc %[i]; dec %[cnt]; "
3592                "jne 0b; "
3593                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3594                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3595                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3596                : "memory");
3597   return tmp;
3598 }
3599 
3600 // Multiply (unsigned) Long A by Long B, accumulating the double-
3601 // length result into the accumulator formed of T0, T1, and T2.
3602 #define MACC(A, B, T0, T1, T2)                                  \
3603 do {                                                            \
3604   unsigned long hi, lo;                                         \
3605   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3606            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3607            : "r"(A), "a"(B) : "cc");                            \
3608  } while(0)
3609 
3610 // As above, but add twice the double-length result into the
3611 // accumulator.
3612 #define MACC2(A, B, T0, T1, T2)                                 \
3613 do {                                                            \
3614   unsigned long hi, lo;                                         \
3615   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3616            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3617            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3618            : "r"(A), "a"(B) : "cc");                            \
3619  } while(0)
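     // In other words, these macros maintain a 192-bit accumulator (T2:T1:T0) and compute
     // (T2:T1:T0) += A * B for MACC and (T2:T1:T0) += 2 * A * B for MACC2, where A * B is the
     // full 128-bit product.  A usage sketch (illustrative):
     //
     //   julong t0 = 0, t1 = 0, t2 = 0;
     //   MACC(a, b, t0, t1, t2);    // t2:t1:t0 == a * b
     //   MACC2(c, d, t0, t1, t2);   // t2:t1:t0 == a * b + 2 * c * d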
3620 
3621 #else //_WINDOWS
3622 
3623 static julong
3624 sub(julong a[], julong b[], julong carry, long len) {
3625   long i;
3626   julong tmp;
3627   unsigned char c = 1;
3628   for (i = 0; i < len; i++) {
3629     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3630     a[i] = tmp;
3631   }
3632   c = _addcarry_u64(c, carry, ~0, &tmp);
3633   return tmp;
3634 }
3635 
3636 // Multiply (unsigned) Long A by Long B, accumulating the double-
3637 // length result into the accumulator formed of T0, T1, and T2.
3638 #define MACC(A, B, T0, T1, T2)                          \
3639 do {                                                    \
3640   julong hi, lo;                            \
3641   lo = _umul128(A, B, &hi);                             \
3642   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3643   c = _addcarry_u64(c, hi, T1, &T1);                    \
3644   _addcarry_u64(c, T2, 0, &T2);                         \
3645  } while(0)
3646 
3647 // As above, but add twice the double-length result into the
3648 // accumulator.
3649 #define MACC2(A, B, T0, T1, T2)                         \
3650 do {                                                    \
3651   julong hi, lo;                            \
3652   lo = _umul128(A, B, &hi);                             \
3653   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3654   c = _addcarry_u64(c, hi, T1, &T1);                    \
3655   _addcarry_u64(c, T2, 0, &T2);                         \
3656   c = _addcarry_u64(0, lo, T0, &T0);                    \
3657   c = _addcarry_u64(c, hi, T1, &T1);                    \
3658   _addcarry_u64(c, T2, 0, &T2);                         \
3659  } while(0)
3660 
3661 #endif //_WINDOWS
3662 
3663 // Fast Montgomery multiplication.  The derivation of the algorithm is
3664 // in  A Cryptographic Library for the Motorola DSP56000,
3665 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
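     // With R = 2^(64*len) and inv chosen so that inv * n[0] == -1 (mod 2^64) (see the assert
     // below), this computes m with
     //
     //   m == a * b * R^-1  (mod n)
     //
     // Each outer iteration adds a multiple m[i] * n, with m[i] = t0 * inv chosen to zero the
     // low 64 bits of the accumulator so the whole value can be shifted down by one word.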
3666 
3667 static void NOINLINE
3668 montgomery_multiply(julong a[], julong b[], julong n[],
3669                     julong m[], julong inv, int len) {
3670   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3671   int i;
3672 
3673   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3674 
3675   for (i = 0; i < len; i++) {
3676     int j;
3677     for (j = 0; j < i; j++) {
3678       MACC(a[j], b[i-j], t0, t1, t2);
3679       MACC(m[j], n[i-j], t0, t1, t2);
3680     }
3681     MACC(a[i], b[0], t0, t1, t2);
3682     m[i] = t0 * inv;
3683     MACC(m[i], n[0], t0, t1, t2);
3684 
3685     assert(t0 == 0, "broken Montgomery multiply");
3686 
3687     t0 = t1; t1 = t2; t2 = 0;
3688   }
3689 
3690   for (i = len; i < 2*len; i++) {
3691     int j;
3692     for (j = i-len+1; j < len; j++) {
3693       MACC(a[j], b[i-j], t0, t1, t2);
3694       MACC(m[j], n[i-j], t0, t1, t2);
3695     }
3696     m[i-len] = t0;
3697     t0 = t1; t1 = t2; t2 = 0;
3698   }
3699 
3700   while (t0)
3701     t0 = sub(m, n, t0, len);
3702 }
3703 
3704 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3705 // multiplies so it should be up to 25% faster than Montgomery
3706 // multiplication.  However, its loop control is more complex and it
3707 // may actually run slower on some machines.
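     // Concretely, each cross term a[j] * a[i-j] (with j != i-j) appears twice in the square and
     // is accumulated with MACC2, while the diagonal term a[j] * a[j] (present only when i is
     // even) is accumulated once with MACC.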
3708 
3709 static void NOINLINE
3710 montgomery_square(julong a[], julong n[],
3711                   julong m[], julong inv, int len) {
3712   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3713   int i;
3714 
3715   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3716 
3717   for (i = 0; i < len; i++) {
3718     int j;
3719     int end = (i+1)/2;
3720     for (j = 0; j < end; j++) {
3721       MACC2(a[j], a[i-j], t0, t1, t2);
3722       MACC(m[j], n[i-j], t0, t1, t2);
3723     }
3724     if ((i & 1) == 0) {
3725       MACC(a[j], a[j], t0, t1, t2);
3726     }
3727     for (; j < i; j++) {
3728       MACC(m[j], n[i-j], t0, t1, t2);
3729     }
3730     m[i] = t0 * inv;
3731     MACC(m[i], n[0], t0, t1, t2);
3732 
3733     assert(t0 == 0, "broken Montgomery square");
3734 
3735     t0 = t1; t1 = t2; t2 = 0;
3736   }
3737 
3738   for (i = len; i < 2*len; i++) {
3739     int start = i-len+1;
3740     int end = start + (len - start)/2;
3741     int j;
3742     for (j = start; j < end; j++) {
3743       MACC2(a[j], a[i-j], t0, t1, t2);
3744       MACC(m[j], n[i-j], t0, t1, t2);
3745     }
3746     if ((i & 1) == 0) {
3747       MACC(a[j], a[j], t0, t1, t2);
3748     }
3749     for (; j < len; j++) {
3750       MACC(m[j], n[i-j], t0, t1, t2);
3751     }
3752     m[i-len] = t0;
3753     t0 = t1; t1 = t2; t2 = 0;
3754   }
3755 
3756   while (t0)
3757     t0 = sub(m, n, t0, len);
3758 }
3759 
3760 // Swap words in a longword.
3761 static julong swap(julong x) {
3762   return (x << 32) | (x >> 32);
3763 }
3764 
3765 // Copy len longwords from s to d, word-swapping as we go.  The
3766 // destination array is reversed.
3767 static void reverse_words(julong *s, julong *d, int len) {
3768   d += len;
3769   while(len-- > 0) {
3770     d--;
3771     *d = swap(*s);
3772     s++;
3773   }
3774 }
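     // For example (illustrative): with len == 2 and s == { 0x0000000200000001, 0x0000000400000003 },
     // the result is d == { 0x0000000300000004, 0x0000000100000002 }: each longword has its 32-bit
     // halves swapped and the order of the longwords is reversed.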
3775 
3776 // The threshold at which squaring is advantageous was determined
3777 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3778 #define MONTGOMERY_SQUARING_THRESHOLD 64
3779 
3780 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3781                                         jint len, jlong inv,
3782                                         jint *m_ints) {
3783   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3784   int longwords = len/2;
3785 
3786   // Make very sure we don't use so much space that the stack might
3787   // overflow.  512 jints correspond to a 16384-bit integer and would
3788   // use a total of 8K bytes of stack space here.
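       // For example: len == 512 jints gives longwords == 256, and 256 * sizeof(julong) * 4 ==
       // 8192 bytes, which matches the 8192 / divisor == 256 longword bound guaranteed below.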
3789   int divisor = sizeof(julong) * 4;
3790   guarantee(longwords <= 8192 / divisor, "must be");
3791   int total_allocation = longwords * sizeof (julong) * 4;
3792   julong *scratch = (julong *)alloca(total_allocation);
3793 
3794   // Local scratch arrays
3795   julong
3796     *a = scratch + 0 * longwords,
3797     *b = scratch + 1 * longwords,
3798     *n = scratch + 2 * longwords,
3799     *m = scratch + 3 * longwords;
3800 
3801   reverse_words((julong *)a_ints, a, longwords);
3802   reverse_words((julong *)b_ints, b, longwords);
3803   reverse_words((julong *)n_ints, n, longwords);
3804 
3805   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3806 
3807   reverse_words(m, (julong *)m_ints, longwords);
3808 }
3809 
3810 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3811                                       jint len, jlong inv,
3812                                       jint *m_ints) {
3813   assert(len % 2 == 0, "array length in montgomery_square must be even");
3814   int longwords = len/2;
3815 
3816   // Make very sure we don't use so much space that the stack might
3817   // overflow.  512 jints correspond to a 16384-bit integer and would
3818   // use a total of 6K bytes of stack space here.
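       // For example: len == 512 jints gives longwords == 256, and 256 * sizeof(julong) * 3 ==
       // 6144 bytes, well within the 8192 / divisor == 341 longword bound guaranteed below.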
3819   int divisor = sizeof(julong) * 3;
3820   guarantee(longwords <= (8192 / divisor), "must be");
3821   int total_allocation = longwords * sizeof (julong) * 3;
3822   julong *scratch = (julong *)alloca(total_allocation);
3823 
3824   // Local scratch arrays
3825   julong
3826     *a = scratch + 0 * longwords,
3827     *n = scratch + 1 * longwords,
3828     *m = scratch + 2 * longwords;
3829 
3830   reverse_words((julong *)a_ints, a, longwords);
3831   reverse_words((julong *)n_ints, n, longwords);
3832 
3833   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3834     ::montgomery_square(a, n, m, (julong)inv, longwords);
3835   } else {
3836     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3837   }
3838 
3839   reverse_words(m, (julong *)m_ints, longwords);
3840 }
3841 
3842 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3843   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3844   if (buf == nullptr) {
3845     return nullptr;
3846   }
3847   CodeBuffer buffer(buf);
3848   short buffer_locs[20];
3849   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3850                                          sizeof(buffer_locs)/sizeof(relocInfo));
3851 
3852   MacroAssembler* masm = new MacroAssembler(&buffer);
3853 
3854   const Array<SigEntry>* sig_vk = vk->extended_sig();
3855   const Array<VMRegPair>* regs = vk->return_regs();
3856 
3857   int pack_fields_jobject_off = __ offset();
3858   // Resolve pre-allocated buffer from JNI handle.
3859   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3860   __ movptr(rax, Address(r13, 0));
3861   __ resolve_jobject(rax /* value */,
3862                      r12 /* tmp */);
3863   __ movptr(Address(r13, 0), rax);
3864 
3865   int pack_fields_off = __ offset();
3866 
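       // Walk the extended signature and store each field into the pre-allocated buffer in rax.
       // T_METADATA entries are skipped entirely, and a T_VOID entry marks the upper half of a
       // preceding long/double, so for it only the return-register index j is advanced.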
3867   int j = 1;
3868   for (int i = 0; i < sig_vk->length(); i++) {
3869     BasicType bt = sig_vk->at(i)._bt;
3870     if (bt == T_METADATA) {
3871       continue;
3872     }
3873     if (bt == T_VOID) {
3874       if (sig_vk->at(i-1)._bt == T_LONG ||
3875           sig_vk->at(i-1)._bt == T_DOUBLE) {
3876         j++;
3877       }
3878       continue;
3879     }
3880     int off = sig_vk->at(i)._offset;
3881     assert(off > 0, "offset in object should be positive");
3882     VMRegPair pair = regs->at(j);
3883     VMReg r_1 = pair.first();
3884     VMReg r_2 = pair.second();
3885     Address to(rax, off);
3886     if (bt == T_FLOAT) {
3887       __ movflt(to, r_1->as_XMMRegister());
3888     } else if (bt == T_DOUBLE) {
3889       __ movdbl(to, r_1->as_XMMRegister());
3890     } else {
3891       Register val = r_1->as_Register();
3892       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3893       if (is_reference_type(bt)) {
3894         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3895       } else {
3896         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3897       }
3898     }
3899     j++;
3900   }
3901   assert(j == regs->length(), "missed a field?");
3902   if (vk->has_nullable_atomic_layout()) {
3903     // Set the null marker
3904     __ movb(Address(rax, vk->null_marker_offset()), 1);
3905   }
3906   __ ret(0);
3907 
3908   int unpack_fields_off = __ offset();
3909 
3910   Label skip;
3911   Label not_null;
3912   __ testptr(rax, rax);
3913   __ jcc(Assembler::notZero, not_null);
3914 
3915   // Return value is null. Zero oop registers to make the GC happy.
3916   j = 1;
3917   for (int i = 0; i < sig_vk->length(); i++) {
3918     BasicType bt = sig_vk->at(i)._bt;
3919     if (bt == T_METADATA) {
3920       continue;
3921     }
3922     if (bt == T_VOID) {
3923       if (sig_vk->at(i-1)._bt == T_LONG ||
3924           sig_vk->at(i-1)._bt == T_DOUBLE) {
3925         j++;
3926       }
3927       continue;
3928     }
3929     if (bt == T_OBJECT || bt == T_ARRAY) {
3930       VMRegPair pair = regs->at(j);
3931       VMReg r_1 = pair.first();
3932       __ xorq(r_1->as_Register(), r_1->as_Register());
3933     }
3934     j++;
3935   }
3936   __ jmp(skip);
3937   __ bind(not_null);
3938 
3939   j = 1;
3940   for (int i = 0; i < sig_vk->length(); i++) {
3941     BasicType bt = sig_vk->at(i)._bt;
3942     if (bt == T_METADATA) {
3943       continue;
3944     }
3945     if (bt == T_VOID) {
3946       if (sig_vk->at(i-1)._bt == T_LONG ||
3947           sig_vk->at(i-1)._bt == T_DOUBLE) {
3948         j++;
3949       }
3950       continue;
3951     }
3952     int off = sig_vk->at(i)._offset;
3953     assert(off > 0, "offset in object should be positive");
3954     VMRegPair pair = regs->at(j);
3955     VMReg r_1 = pair.first();
3956     VMReg r_2 = pair.second();
3957     Address from(rax, off);
3958     if (bt == T_FLOAT) {
3959       __ movflt(r_1->as_XMMRegister(), from);
3960     } else if (bt == T_DOUBLE) {
3961       __ movdbl(r_1->as_XMMRegister(), from);
3962     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3963       assert_different_registers(rax, r_1->as_Register());
3964       __ load_heap_oop(r_1->as_Register(), from);
3965     } else {
3966       assert(is_java_primitive(bt), "unexpected basic type");
3967       assert_different_registers(rax, r_1->as_Register());
3968       size_t size_in_bytes = type2aelembytes(bt);
3969       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3970     }
3971     j++;
3972   }
3973   assert(j == regs->length(), "missed a field?");
3974 
3975   __ bind(skip);
3976   __ ret(0);
3977 
3978   __ flush();
3979 
3980   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3981 }
3982 
3983 #if INCLUDE_JFR
3984 
3985 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
3986 // It returns a jobject handle to the event writer.
3987 // The handle is dereferenced and the return value is the event writer oop.
3988 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3989   enum layout {
3990     rbp_off,
3991     rbpH_off,
3992     return_off,
3993     return_off2,
3994     framesize // inclusive of return address
3995   };
3996 
3997   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3998   CodeBuffer code(name, 1024, 64);
3999   MacroAssembler* masm = new MacroAssembler(&code);
4000   address start = __ pc();
4001 
4002   __ enter();
4003   address the_pc = __ pc();
4004 
4005   int frame_complete = the_pc - start;
4006 
4007   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
4008   __ movptr(c_rarg0, r15_thread);
4009   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
4010   __ reset_last_Java_frame(true);
4011 
4012   // rax is jobject handle result, unpack and process it through a barrier.
4013   __ resolve_global_jobject(rax, c_rarg0);
4014 
4015   __ leave();
4016   __ ret(0);
4017 
4018   OopMapSet* oop_maps = new OopMapSet();
4019   OopMap* map = new OopMap(framesize, 1);
4020   oop_maps->add_gc_map(frame_complete, map);
4021 
4022   RuntimeStub* stub =
4023     RuntimeStub::new_runtime_stub(name,
4024                                   &code,
4025                                   frame_complete,
4026                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4027                                   oop_maps,
4028                                   false);
4029   return stub;
4030 }
4031 
4032 // For c2: call to return a leased buffer.
4033 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
4034   enum layout {
4035     rbp_off,
4036     rbpH_off,
4037     return_off,
4038     return_off2,
4039     framesize // inclusive of return address
4040   };
4041 
4042   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
4043   CodeBuffer code(name, 1024, 64);
4044   MacroAssembler* masm = new MacroAssembler(&code);
4045   address start = __ pc();
4046 
4047   __ enter();
4048   address the_pc = __ pc();
4049 
4050   int frame_complete = the_pc - start;
4051 
4052   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4053   __ movptr(c_rarg0, r15_thread);
4054   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4055   __ reset_last_Java_frame(true);
4056 
4057   __ leave();
4058   __ ret(0);
4059 
4060   OopMapSet* oop_maps = new OopMapSet();
4061   OopMap* map = new OopMap(framesize, 1);
4062   oop_maps->add_gc_map(frame_complete, map);
4063 
4064   RuntimeStub* stub =
4065     RuntimeStub::new_runtime_stub(name,
4066                                   &code,
4067                                   frame_complete,
4068                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4069                                   oop_maps,
4070                                   false);
4071   return stub;
4072 }
4073 
4074 #endif // INCLUDE_JFR