1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/aotCodeCache.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 
// Emission shorthand used throughout this file: "__ insn(...)" expands to
// "masm->insn(...)", so a MacroAssembler* named 'masm' must be in scope
// wherever it is used.
#define __ masm->

// BLOCK_COMMENT(str) forwards to masm->block_comment(str) in non-PRODUCT
// builds and expands to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

// Stack alignment expressed in VMReg stack slots instead of bytes.
const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  79 
// RegisterSaver emits the code that saves and restores the register state
// around a call into the runtime, and defines (via the 'layout' enum) where
// each saved register lives in the resulting frame so that an OopMap can be
// built for it.
class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.

  // Byte offsets of the individual register areas inside the FPU state image.
  // save_live_registers()/restore_live_registers() use them directly as
  // rsp-relative displacements, and the enum below converts them to jint
  // slots relative to xmm_off.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664

  // Each DEF_*_OFFS(n) macro declares two consecutive enumerators:
  // <name>n_off with an explicit slot value, and <name>nH_off which takes the
  // following slot. The per-register stride is 16 bytes for the XMM and YMM
  // areas, 32 bytes for the ZMM area, 8 bytes for opmask registers and
  // 64 bytes for the upper bank (registers 16 and up).
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  // Frame layout in jint slots, counted upwards from the stack pointer after
  // save_live_registers() has run. Only the first two registers of each
  // vector/opmask group get named enumerators; callers derive the stride from
  // the difference between them and step through the rest.
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // Extended general purpose registers r16..r31, two slots each
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy general purpose registers, two slots each, directly above the
    // FPU state image
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  // Emits the register save sequence and returns an OopMap describing the
  // saved locations; *total_frame_words receives the frame size in words.
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  // Emits the sequence that undoes save_live_registers().
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own
  // (each accessor converts the jint-slot enum value to a byte offset)

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
 179 
 180 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 181   int off = 0;
 182   int num_xmm_regs = XMMRegister::available_xmm_registers();
 183 #ifdef COMPILER2
 184   if (save_wide_vectors && UseAVX == 0) {
 185     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 186   }
 187   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 188 #else
 189   save_wide_vectors = false; // vectors are generated only by C2
 190 #endif // COMPILER2
 191 
 192   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 193   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 194   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 195   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 196   // CodeBlob frame size is in words.
 197   int frame_size_in_words = frame_size_in_bytes / wordSize;
 198   *total_frame_words = frame_size_in_words;
 199 
 200   // Save registers, fpu state, and flags.
 201   // We assume caller has already pushed the return address onto the
 202   // stack, so rsp is 8-byte aligned here.
 203   // We push rpb twice in this sequence because we want the real rbp
 204   // to be under the return like a normal enter.
 205 
 206   __ enter();          // rsp becomes 16-byte aligned here
 207   __ pushf();
 208   // Make sure rsp stays 16-byte aligned
 209   __ subq(rsp, 8);
 210   // Push CPU state in multiple of 16 bytes
 211   __ save_legacy_gprs();
 212   __ push_FPU_state();
 213 
 214 
 215   // push cpu state handles this on EVEX enabled targets
 216   if (save_wide_vectors) {
 217     // Save upper half of YMM registers(0..15)
 218     int base_addr = XSAVE_AREA_YMM_BEGIN;
 219     for (int n = 0; n < 16; n++) {
 220       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 221     }
 222     if (VM_Version::supports_evex()) {
 223       // Save upper half of ZMM registers(0..15)
 224       base_addr = XSAVE_AREA_ZMM_BEGIN;
 225       for (int n = 0; n < 16; n++) {
 226         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 227       }
 228       // Save full ZMM registers(16..num_xmm_regs)
 229       base_addr = XSAVE_AREA_UPPERBANK;
 230       off = 0;
 231       int vector_len = Assembler::AVX_512bit;
 232       for (int n = 16; n < num_xmm_regs; n++) {
 233         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 234       }
 235 #ifdef COMPILER2
 236       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 237       off = 0;
 238       for(int n = 0; n < KRegister::number_of_registers; n++) {
 239         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 240       }
 241 #endif // COMPILER2
 242     }
 243   } else {
 244     if (VM_Version::supports_evex()) {
 245       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 246       int base_addr = XSAVE_AREA_UPPERBANK;
 247       off = 0;
 248       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 249       for (int n = 16; n < num_xmm_regs; n++) {
 250         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 251       }
 252 #ifdef COMPILER2
 253       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 254       off = 0;
 255       for(int n = 0; n < KRegister::number_of_registers; n++) {
 256         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 257       }
 258 #endif // COMPILER2
 259     }
 260   }
 261 
 262 #ifdef COMPILER2
 263   if (UseAPX) {
 264       int base_addr = XSAVE_AREA_EGPRS;
 265       off = 0;
 266       for (int n = 16; n < Register::number_of_registers; n++) {
 267         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 268       }
 269   }
 270 #endif // COMPILER2
 271 
 272   __ vzeroupper();
 273   if (frame::arg_reg_save_area_bytes != 0) {
 274     // Allocate argument register save area
 275     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 276   }
 277 
 278   // Set an oopmap for the call site.  This oopmap will map all
 279   // oop-registers and debug-info registers as callee-saved.  This
 280   // will allow deoptimization at this safepoint to find all possible
 281   // debug-info recordings, as well as let GC find all oops.
 282 
 283   OopMapSet *oop_maps = new OopMapSet();
 284   OopMap* map = new OopMap(frame_size_in_slots, 0);
 285 
 286 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 287 
 288   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 289   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 290   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 292   // rbp location is known implicitly by the frame sender code, needs no oopmap
 293   // and the location where rbp was saved by is ignored
 294   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 295   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 296   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 304 
 305   if (UseAPX) {
 306     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 307     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 308     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 322   }
 323   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 324   // on EVEX enabled targets, we get it included in the xsave area
 325   off = xmm0_off;
 326   int delta = xmm1_off - off;
 327   for (int n = 0; n < 16; n++) {
 328     XMMRegister xmm_name = as_XMMRegister(n);
 329     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 330     off += delta;
 331   }
 332   if (UseAVX > 2) {
 333     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 334     off = zmm16_off;
 335     delta = zmm17_off - off;
 336     for (int n = 16; n < num_xmm_regs; n++) {
 337       XMMRegister zmm_name = as_XMMRegister(n);
 338       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 339       off += delta;
 340     }
 341   }
 342 
 343 #ifdef COMPILER2
 344   if (save_wide_vectors) {
 345     // Save upper half of YMM registers(0..15)
 346     off = ymm0_off;
 347     delta = ymm1_off - ymm0_off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister ymm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 351       off += delta;
 352     }
 353     if (VM_Version::supports_evex()) {
 354       // Save upper half of ZMM registers(0..15)
 355       off = zmm0_off;
 356       delta = zmm1_off - zmm0_off;
 357       for (int n = 0; n < 16; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 360         off += delta;
 361       }
 362     }
 363   }
 364 #endif // COMPILER2
 365 
 366   // %%% These should all be a waste but we'll keep things as they were for now
 367   if (true) {
 368     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 369     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 370     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 372     // rbp location is known implicitly by the frame sender code, needs no oopmap
 373     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 375     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 383     if (UseAPX) {
 384       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 385       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 386       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 400     }
 401     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 402     // on EVEX enabled targets, we get it included in the xsave area
 403     off = xmm0H_off;
 404     delta = xmm1H_off - off;
 405     for (int n = 0; n < 16; n++) {
 406       XMMRegister xmm_name = as_XMMRegister(n);
 407       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 408       off += delta;
 409     }
 410     if (UseAVX > 2) {
 411       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 412       off = zmm16H_off;
 413       delta = zmm17H_off - off;
 414       for (int n = 16; n < num_xmm_regs; n++) {
 415         XMMRegister zmm_name = as_XMMRegister(n);
 416         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 417         off += delta;
 418       }
 419     }
 420   }
 421 
 422   return map;
 423 }
 424 
 425 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 426   int num_xmm_regs = XMMRegister::available_xmm_registers();
 427   if (frame::arg_reg_save_area_bytes != 0) {
 428     // Pop arg register save area
 429     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 430   }
 431 
 432 #ifdef COMPILER2
 433   if (restore_wide_vectors) {
 434     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 435     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 436   }
 437 #else
 438   assert(!restore_wide_vectors, "vectors are generated only by C2");
 439 #endif // COMPILER2
 440 
 441   __ vzeroupper();
 442 
 443   // On EVEX enabled targets everything is handled in pop fpu state
 444   if (restore_wide_vectors) {
 445     // Restore upper half of YMM registers (0..15)
 446     int base_addr = XSAVE_AREA_YMM_BEGIN;
 447     for (int n = 0; n < 16; n++) {
 448       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 449     }
 450     if (VM_Version::supports_evex()) {
 451       // Restore upper half of ZMM registers (0..15)
 452       base_addr = XSAVE_AREA_ZMM_BEGIN;
 453       for (int n = 0; n < 16; n++) {
 454         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 455       }
 456       // Restore full ZMM registers(16..num_xmm_regs)
 457       base_addr = XSAVE_AREA_UPPERBANK;
 458       int vector_len = Assembler::AVX_512bit;
 459       int off = 0;
 460       for (int n = 16; n < num_xmm_regs; n++) {
 461         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 462       }
 463 #ifdef COMPILER2
 464       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 465       off = 0;
 466       for (int n = 0; n < KRegister::number_of_registers; n++) {
 467         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 468       }
 469 #endif // COMPILER2
 470     }
 471   } else {
 472     if (VM_Version::supports_evex()) {
 473       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 474       int base_addr = XSAVE_AREA_UPPERBANK;
 475       int off = 0;
 476       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 477       for (int n = 16; n < num_xmm_regs; n++) {
 478         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 479       }
 480 #ifdef COMPILER2
 481       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 482       off = 0;
 483       for (int n = 0; n < KRegister::number_of_registers; n++) {
 484         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 485       }
 486 #endif // COMPILER2
 487     }
 488   }
 489 
 490 #ifdef COMPILER2
 491   if (UseAPX) {
 492     int base_addr = XSAVE_AREA_EGPRS;
 493     int off = 0;
 494     for (int n = 16; n < Register::number_of_registers; n++) {
 495       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 496     }
 497   }
 498 #endif // COMPILER2
 499 
 500   // Recover CPU state
 501   __ pop_FPU_state();
 502   __ restore_legacy_gprs();
 503   __ addq(rsp, 8);
 504   __ popf();
 505   // Get the rbp described implicitly by the calling convention (no oopMap)
 506   __ pop(rbp);
 507 }
 508 
 509 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 510 
 511   // Just restore result register. Only used by deoptimization. By
 512   // now any callee save register that needs to be restored to a c2
 513   // caller of the deoptee has been extracted into the vframeArray
 514   // and will be stuffed into the c2i adapter we create for later
 515   // restoration so only result registers need to be restored here.
 516 
 517   // Restore fp result register
 518   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 519   // Restore integer result register
 520   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 521   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 522 
 523   // Pop all of the register save are off the stack except the return address
 524   __ addptr(rsp, return_offset_in_bytes());
 525 }
 526 
 527 // Is vector's size (in bytes) bigger than a size saved by default?
 528 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 529 bool SharedRuntime::is_wide_vector(int size) {
 530   return size > 16;
 531 }
 532 
 533 // ---------------------------------------------------------------------------
 534 // Read the array of BasicTypes from a signature, and compute where the
 535 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 536 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 537 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 538 // as framesizes are fixed.
 539 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 541 // Register up to Register::number_of_registers are the 64-bit
 542 // integer registers.
 543 
 544 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 545 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 546 // units regardless of build. Of course for i486 there is no 64 bit build
 547 
 548 // The Java calling convention is a "shifted" version of the C ABI.
 549 // By skipping the first C ABI register we can call non-static jni methods
 550 // with small numbers of arguments without having to shuffle the arguments
 551 // at all. Since we control the java ABI we ought to at least get some
 552 // advantage out of it.
 553 
 554 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 555                                            VMRegPair *regs,
 556                                            int total_args_passed) {
 557 
 558   // Create the mapping between argument positions and
 559   // registers.
 560   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 561     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 562   };
 563   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 564     j_farg0, j_farg1, j_farg2, j_farg3,
 565     j_farg4, j_farg5, j_farg6, j_farg7
 566   };
 567 
 568 
 569   uint int_args = 0;
 570   uint fp_args = 0;
 571   uint stk_args = 0;
 572 
 573   for (int i = 0; i < total_args_passed; i++) {
 574     switch (sig_bt[i]) {
 575     case T_BOOLEAN:
 576     case T_CHAR:
 577     case T_BYTE:
 578     case T_SHORT:
 579     case T_INT:
 580       if (int_args < Argument::n_int_register_parameters_j) {
 581         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 582       } else {
 583         stk_args = align_up(stk_args, 2);
 584         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 585         stk_args += 1;
 586       }
 587       break;
 588     case T_VOID:
 589       // halves of T_LONG or T_DOUBLE
 590       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 591       regs[i].set_bad();
 592       break;
 593     case T_LONG:
 594       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 595       // fall through
 596     case T_OBJECT:
 597     case T_ARRAY:
 598     case T_ADDRESS:
 599       if (int_args < Argument::n_int_register_parameters_j) {
 600         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 601       } else {
 602         stk_args = align_up(stk_args, 2);
 603         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 604         stk_args += 2;
 605       }
 606       break;
 607     case T_FLOAT:
 608       if (fp_args < Argument::n_float_register_parameters_j) {
 609         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 610       } else {
 611         stk_args = align_up(stk_args, 2);
 612         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 613         stk_args += 1;
 614       }
 615       break;
 616     case T_DOUBLE:
 617       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 618       if (fp_args < Argument::n_float_register_parameters_j) {
 619         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 620       } else {
 621         stk_args = align_up(stk_args, 2);
 622         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 623         stk_args += 2;
 624       }
 625       break;
 626     default:
 627       ShouldNotReachHere();
 628       break;
 629     }
 630   }
 631 
 632   return stk_args;
 633 }
 634 
 635 // Same as java_calling_convention() but for multiple return
 636 // values. There's no way to store them on the stack so if we don't
 637 // have enough registers, multiple values can't be returned.
 638 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 639 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 640 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 641                                           VMRegPair *regs,
 642                                           int total_args_passed) {
 643   // Create the mapping between argument positions and
 644   // registers.
 645   static const Register INT_ArgReg[java_return_convention_max_int] = {
 646     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 647   };
 648   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 649     j_farg0, j_farg1, j_farg2, j_farg3,
 650     j_farg4, j_farg5, j_farg6, j_farg7
 651   };
 652 
 653 
 654   uint int_args = 0;
 655   uint fp_args = 0;
 656 
 657   for (int i = 0; i < total_args_passed; i++) {
 658     switch (sig_bt[i]) {
 659     case T_BOOLEAN:
 660     case T_CHAR:
 661     case T_BYTE:
 662     case T_SHORT:
 663     case T_INT:
 664       if (int_args < Argument::n_int_register_parameters_j+1) {
 665         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 666         int_args++;
 667       } else {
 668         return -1;
 669       }
 670       break;
 671     case T_VOID:
 672       // halves of T_LONG or T_DOUBLE
 673       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 674       regs[i].set_bad();
 675       break;
 676     case T_LONG:
 677       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 678       // fall through
 679     case T_OBJECT:
 680     case T_ARRAY:
 681     case T_ADDRESS:
 682     case T_METADATA:
 683       if (int_args < Argument::n_int_register_parameters_j+1) {
 684         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 685         int_args++;
 686       } else {
 687         return -1;
 688       }
 689       break;
 690     case T_FLOAT:
 691       if (fp_args < Argument::n_float_register_parameters_j) {
 692         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 693         fp_args++;
 694       } else {
 695         return -1;
 696       }
 697       break;
 698     case T_DOUBLE:
 699       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 700       if (fp_args < Argument::n_float_register_parameters_j) {
 701         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 702         fp_args++;
 703       } else {
 704         return -1;
 705       }
 706       break;
 707     default:
 708       ShouldNotReachHere();
 709       break;
 710     }
 711   }
 712 
 713   return int_args + fp_args;
 714 }
 715 
// Patch the callers callsite with entry to compiled code if it exists.
//
// Emits code that tests Method::code of the Method* held in rbx. When that
// field is null the emitted code branches to the end of this sequence and
// nothing else happens. Otherwise it calls
// SharedRuntime::fixup_callers_callsite with the method (rbx) and the word at
// the top of the stack (the caller's return address) as arguments.
// r13 holds the original rsp across the call and rax is used as a temporary
// for the return address.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // No compiled code for this method: skip the whole fixup sequence.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  // First C argument: the method; second C argument: the return address
  // captured in rax above.
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp (undoes the alignment applied before push_CPU_state)
  __ mov(rsp, r13);
  __ bind(L);
}
 757 
 758 // For each inline type argument, sig includes the list of fields of
 759 // the inline type. This utility function computes the number of
 760 // arguments for the call if inline types are passed by reference (the
 761 // calling convention the interpreter expects).
 762 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 763   int total_args_passed = 0;
 764   if (InlineTypePassFieldsAsArgs) {
 765     for (int i = 0; i < sig_extended->length(); i++) {
 766       BasicType bt = sig_extended->at(i)._bt;
 767       if (bt == T_METADATA) {
 768         // In sig_extended, an inline type argument starts with:
 769         // T_METADATA, followed by the types of the fields of the
 770         // inline type and T_VOID to mark the end of the value
 771         // type. Inline types are flattened so, for instance, in the
 772         // case of an inline type with an int field and an inline type
 773         // field that itself has 2 fields, an int and a long:
 774         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 775         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 776         // (outer inline type)
 777         total_args_passed++;
 778         int vt = 1;
 779         do {
 780           i++;
 781           BasicType bt = sig_extended->at(i)._bt;
 782           BasicType prev_bt = sig_extended->at(i-1)._bt;
 783           if (bt == T_METADATA) {
 784             vt++;
 785           } else if (bt == T_VOID &&
 786                      prev_bt != T_LONG &&
 787                      prev_bt != T_DOUBLE) {
 788             vt--;
 789           }
 790         } while (vt != 0);
 791       } else {
 792         total_args_passed++;
 793       }
 794     }
 795   } else {
 796     total_args_passed = sig_extended->length();
 797   }
 798   return total_args_passed;
 799 }
 800 
 801 
// Emits the code that moves one value passed in the compiled calling
// convention to the memory location 'to'.
//
//   bt            - type of the signature entry; a T_VOID entry emits nothing
//   prev_bt       - type of the preceding signature entry, only used to check
//                   that a T_VOID follows a T_LONG or T_DOUBLE
//   size_in_bytes - number of bytes to load/store; a value equal to wordSize
//                   selects the wide forms (second VMReg valid, movdbl)
//   reg_pair      - where compiled code passed the value: general purpose
//                   register, XMM register or stack slot
//   to            - destination address
//   extraspace    - byte offset added to the stack-slot offset when the
//                   source is a stack slot
//   is_oop        - for non-XMM sources, store with store_heap_oop instead of
//                   a plain sized store
//
// rax is used as a temporary when the source is a stack slot.
static void gen_c2i_adapter_helper(MacroAssembler* masm,
                                   BasicType bt,
                                   BasicType prev_bt,
                                   size_t size_in_bytes,
                                   const VMRegPair& reg_pair,
                                   const Address& to,
                                   int extraspace,
                                   bool is_oop) {
  if (bt == T_VOID) {
    assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
    return;
  }

  // Say 4 args:
  // i   st_off
  // 0   32 T_LONG
  // 1   24 T_VOID
  // 2   16 T_OBJECT
  // 3    8 T_BOOL
  // -    0 return address
  //
  // However to make things extra confusing. Because we can fit a long/double in
  // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See I said it was confusing.

  bool wide = (size_in_bytes == wordSize);
  VMReg r_1 = reg_pair.first();
  VMReg r_2 = reg_pair.second();
  assert(r_2->is_valid() == wide, "invalid size");
  if (!r_1->is_valid()) {
    // No location assigned to this entry: nothing to move.
    assert(!r_2->is_valid(), "must be invalid");
    return;
  }

  if (!r_1->is_XMMRegister()) {
    // Source is a general purpose register or a stack slot.
    Register val = rax;
    if (r_1->is_stack()) {
      // Stack source: load it into rax first (not sign extended).
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
    } else {
      val = r_1->as_Register();
    }
    assert_different_registers(to.base(), val, rscratch1);
    if (is_oop) {
      // r13 and rbx are handed to store_heap_oop as temporaries, so their
      // values are saved and restored around it.
      __ push(r13);
      __ push(rbx);
      // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep it valid.
      __ push(to.base());
      __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      __ pop(to.base());
      __ pop(rbx);
      __ pop(r13);
    } else {
      __ store_sized_value(to, val, size_in_bytes);
    }
  } else {
    // Source is an XMM register: double when wide, float otherwise.
    if (wide) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      __ movflt(to, r_1->as_XMMRegister());
    }
  }
}
 866 
// Emits a compiled-to-interpreted (c2i) adapter: arguments laid out in the
// compiled calling convention are rewritten into the interpreter's stack
// layout, then control jumps to the method's interpreter entry. rbx holds the
// Method* throughout.
//
//   sig_extended              - signature entries; with
//                               InlineTypePassFieldsAsArgs, inline type
//                               arguments appear as T_METADATA, their field
//                               types and a closing T_VOID
//   regs                      - compiled-convention locations, indexed by
//                               signature position minus the number of
//                               T_METADATA/T_VOID delimiters seen so far
//   requires_clinit_barrier   - when true a class initialization barrier is
//                               emitted first and c2i_no_clinit_check_entry
//                               is set to the pc right after it
//   c2i_no_clinit_check_entry - out: see requires_clinit_barrier; untouched
//                               when that flag is false
//   skip_fixup                - bound right after the call-site fixup code
//   start                     - address the GC map pc offset is measured from
//   oop_maps                  - receives the GC map for the buffer allocation
//                               call, if one is emitted
//   frame_complete,
//   frame_size_in_words       - out: written only when the buffer allocation
//                               call is emitted
//   alloc_inline_receiver     - passed through as the third argument of
//                               SharedRuntime::allocate_inline_types
static void gen_c2i_adapter(MacroAssembler *masm,
                            const GrowableArray<SigEntry>* sig_extended,
                            const VMRegPair *regs,
                            bool requires_clinit_barrier,
                            address& c2i_no_clinit_check_entry,
                            Label& skip_fixup,
                            address start,
                            OopMapSet* oop_maps,
                            int& frame_complete,
                            int& frame_size_in_words,
                            bool alloc_inline_receiver) {
  if (requires_clinit_barrier) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    // Entry point that starts after the class initialization check.
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  if (InlineTypePassFieldsAsArgs) {
    // Is there an inline type argument?
    bool has_inline_argument = false;
    for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
      has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
    }
    if (has_inline_argument) {
      // There is at least a value type argument: we're coming from
      // compiled code so we may not have buffers to back the value
      // objects. Allocate the buffers here with a runtime call for
      // the value arguments that needs a buffer.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

      frame_complete = __ offset();

      __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

      // Arguments: current thread, Method*, alloc_inline_receiver flag.
      __ mov(c_rarg0, r15_thread);
      __ mov(c_rarg1, rbx);
      __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));

      oop_maps->add_gc_map((int)(__ pc() - start), map);
      __ reset_last_Java_frame(false);

      RegisterSaver::restore_live_registers(masm);

      Label no_exception;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, no_exception);

      // Pending exception: clear the vm result, load the exception into rax
      // and forward it.
      __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
      __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
      __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

      __ bind(no_exception);

      // We get an array of objects from the runtime call
      __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
    }
  }

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.
  int total_args_passed = compute_total_args_passed_int(sig_extended);
  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space

  // next_arg_comp is the next argument from the compiler point of
  // view (inline type fields are passed in registers/on the stack). In
  // sig_extended, an inline type argument starts with: T_METADATA,
  // followed by the types of the fields of the inline type and T_VOID
  // to mark the end of the inline type. ignored counts the number of
  // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
  // used to get the buffer for that argument from the pool of buffers
  // we allocated above and want to pass to the
  // interpreter. next_arg_int is the next argument from the
  // interpreter point of view (inline types are passed by reference).
  for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
       next_arg_comp < sig_extended->length(); next_arg_comp++) {
    assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
    assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
    BasicType bt = sig_extended->at(next_arg_comp)._bt;
    // Interpreter slot for this argument, counted down from the first one.
    int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
    if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
      // Plain argument: copy it straight into its interpreter slot. Longs
      // and doubles are stored in the lower of their two slots.
      int next_off = st_off - Interpreter::stackElementSize;
      const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
      const VMRegPair reg_pair = regs[next_arg_comp-ignored];
      size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
      gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                             size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
      next_arg_int++;
#ifdef ASSERT
      if (bt == T_LONG || bt == T_DOUBLE) {
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
        __ movptr(Address(rsp, st_off), rax);
      }
#endif /* ASSERT */
    } else {
      // Inline type argument: it takes a single interpreter slot, filled
      // below with a buffer reference or null.
      ignored++;
      next_arg_int++;
      int vt = 1;
      // write fields we get from compiled code in registers/stack
      // slots to the buffer: we know we are done with that inline type
      // argument when we hit the T_VOID that acts as an end of inline
      // type delimiter for this inline type. Inline types are flattened
      // so we might encounter embedded inline types. Each entry in
      // sig_extended contains a field offset in the buffer.
      Label L_null;
      Label not_null_buffer;
      do {
        next_arg_comp++;
        BasicType bt = sig_extended->at(next_arg_comp)._bt;
        BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
        if (bt == T_METADATA) {
          vt++;
          ignored++;
        } else if (bt == T_VOID &&
                   prev_bt != T_LONG &&
                   prev_bt != T_DOUBLE) {
          vt--;
          ignored++;
        } else if (sig_extended->at(next_arg_comp)._vt_oop) {
          // buffer argument: use if non null
          VMReg buffer = regs[next_arg_comp-ignored].first();
          if (buffer->is_stack()) {
            int ld_off = buffer->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
            __ movptr(r14, Address(rsp, ld_off));
          } else {
            __ movptr(r14, buffer->as_Register());
          }
          // A non-null buffer is passed on as is: this branch skips all the
          // field stores emitted by the remaining iterations of this loop.
          __ testptr(r14, r14);
          __ jcc(Assembler::notEqual, not_null_buffer);
          // otherwise get the buffer from the just allocated pool of buffers
          int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
          __ load_heap_oop(r14, Address(rscratch2, index));
          next_vt_arg++;
        } else {
          int off = sig_extended->at(next_arg_comp)._offset;
          if (off == -1) {
            // Nullable inline type argument, emit null check
            VMReg reg = regs[next_arg_comp-ignored].first();
            Label L_notNull;
            if (reg->is_stack()) {
              int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
              __ testb(Address(rsp, ld_off), 1);
            } else {
              __ testb(reg->as_Register(), 1);
            }
            __ jcc(Assembler::notZero, L_notNull);
            // Low bit clear: store null in the interpreter slot and skip the
            // rest of this inline type argument.
            __ movptr(Address(rsp, st_off), 0);
            __ jmp(L_null);
            __ bind(L_notNull);
            // In a do/while, continue goes to the loop condition; vt is
            // unchanged so the next signature entry is processed.
            continue;
          }
          assert(off > 0, "offset in object should be positive");
          size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
          bool is_oop = is_reference_type(bt);
          // Store the field into the buffer held in r14 at its field offset.
          gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
                                 size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
        }
      } while (vt != 0);
      // pass the buffer to the interpreter
      __ bind(not_null_buffer);
      __ movptr(Address(rsp, st_off), r14);
      // The null path lands here, after the store of r14.
      __ bind(L_null);
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
1098 
// Emits the interpreted-to-compiled (i2c) adapter: arguments are read from
// the interpreter's stack layout and placed in the registers and stack slots
// given by regs, then control jumps to the address loaded from
// Method::from_compiled_inline_offset() of the Method* in rbx.
//
//   comp_args_on_stack - number of 4-byte compiled stack slots needed for
//                        outgoing arguments
//   sig                - signature entries, one per interpreter slot (T_VOID
//                        marks the second half of a long/double)
//   regs               - compiled-convention location for each signature entry
//
// Registers used by the emitted code: r11 (original rsp, then the jump
// target), rax (return address, then saved_sp, then the Method*), r13
// (temporary for stack-to-stack moves).
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int comp_args_on_stack,
                                    const GrowableArray<SigEntry>* sig,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));

  int total_args_passed = sig->length();

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest (stack-passed args) through r13.
  for (int i = 0; i < total_args_passed; i++) {
    BasicType bt = sig->at(i)._bt;
    if (bt == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
      assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      // No compiled location assigned to this entry: nothing to move.
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      // rax holds saved_sp, the base of every load in this loop.
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (bt==T_LONG||bt==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM register argument: float from the argument's own slot, double
      // from the lower of its two slots.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1264 
1265 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1266   Register data = rax;
1267   __ ic_check(1 /* end_alignment */);
1268   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1269 
1270   // Method might have been compiled since the call site was patched to
1271   // interpreted if that is the case treat it as a miss so we can get
1272   // the call site corrected.
1273   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1274   __ jcc(Assembler::equal, skip_fixup);
1275   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1276 }
1277 
// ---------------------------------------------------------------
// Emits the i2c adapter followed by the c2i adapter variants into masm and
// records the start address of each in entry_address (indexed by the
// AdapterBlob entry constants). Up to three c2i bodies are emitted:
//   - sig_cc_ro / regs_cc_ro, only when regs_cc != regs_cc_ro
//   - sig_cc / regs_cc, always
//   - sig / regs, only when regs != regs_cc, preceded by its own inline
//     cache check
// When allocate_code_blob is true the result is wrapped in an AdapterBlob
// that is returned through new_adapter; otherwise new_adapter is not touched.
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
                                            int comp_args_on_stack,
                                            const GrowableArray<SigEntry>* sig,
                                            const VMRegPair* regs,
                                            const GrowableArray<SigEntry>* sig_cc,
                                            const VMRegPair* regs_cc,
                                            const GrowableArray<SigEntry>* sig_cc_ro,
                                            const VMRegPair* regs_cc_ro,
                                            address entry_address[AdapterBlob::ENTRY_COUNT],
                                            AdapterBlob*& new_adapter,
                                            bool allocate_code_blob) {
  entry_address[AdapterBlob::I2C] = __ pc();
  gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  // Both unverified entries start at the inline cache check emitted next;
  // C2I_Unverified_Inline is re-pointed below when regs != regs_cc.
  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
  Label skip_fixup;

  gen_inline_cache_check(masm, skip_fixup);

  // Shared by every gen_c2i_adapter call below; each call that emits the
  // buffer allocation path overwrites frame_complete and frame_size_in_words.
  OopMapSet* oop_maps = new OopMapSet();
  int frame_complete = CodeOffsets::frame_never_safe;
  int frame_size_in_words = 0;

  // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  entry_address[AdapterBlob::C2I_Inline_RO] = __ pc();
  if (regs_cc != regs_cc_ro) {
    // No class init barrier needed because method is guaranteed to be non-static
    gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
    // NOTE(review): presumably clears the label so that the next
    // gen_c2i_adapter call can bind it again - confirm against Label::reset.
    skip_fixup.reset();
  }

  // Scalarized c2i adapter
  entry_address[AdapterBlob::C2I]        = __ pc();
  entry_address[AdapterBlob::C2I_Inline] = __ pc();
  gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                  skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);

  // Non-scalarized c2i adapter
  if (regs != regs_cc) {
    entry_address[AdapterBlob::C2I_Unverified_Inline] = __ pc();
    Label inline_entry_skip_fixup;
    gen_inline_cache_check(masm, inline_entry_skip_fixup);

    // This call also overwrites the C2I_No_Clinit_Check entry set by the
    // scalarized adapter above, since requires_clinit_barrier is true.
    entry_address[AdapterBlob::C2I_Inline] = __ pc();
    gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, entry_address[AdapterBlob::C2I_No_Clinit_Check],
                    inline_entry_skip_fixup, entry_address[AdapterBlob::I2C], oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
  }

  // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
  // the GC knows about the location of oop argument locations passed to the c2i adapter.
  if (allocate_code_blob) {
    bool caller_must_gc_arguments = (regs != regs_cc);
    int entry_offset[AdapterHandlerEntry::ENTRIES_COUNT];
    assert(AdapterHandlerEntry::ENTRIES_COUNT == 7, "sanity");
    // Convert the absolute entry addresses into offsets for the blob.
    AdapterHandlerLibrary::address_to_offset(entry_address, entry_offset);
    new_adapter = AdapterBlob::create(masm->code(), entry_offset, frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
  }
}
1349 
1350 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1351                                          VMRegPair *regs,
1352                                          int total_args_passed) {
1353 
1354 // We return the amount of VMRegImpl stack slots we need to reserve for all
1355 // the arguments NOT counting out_preserve_stack_slots.
1356 
1357 // NOTE: These arrays will have to change when c1 is ported
1358 #ifdef _WIN64
1359     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1360       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1361     };
1362     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1363       c_farg0, c_farg1, c_farg2, c_farg3
1364     };
1365 #else
1366     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1367       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1368     };
1369     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1370       c_farg0, c_farg1, c_farg2, c_farg3,
1371       c_farg4, c_farg5, c_farg6, c_farg7
1372     };
1373 #endif // _WIN64
1374 
1375 
1376     uint int_args = 0;
1377     uint fp_args = 0;
1378     uint stk_args = 0; // inc by 2 each time
1379 
1380     for (int i = 0; i < total_args_passed; i++) {
1381       switch (sig_bt[i]) {
1382       case T_BOOLEAN:
1383       case T_CHAR:
1384       case T_BYTE:
1385       case T_SHORT:
1386       case T_INT:
1387         if (int_args < Argument::n_int_register_parameters_c) {
1388           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1389 #ifdef _WIN64
1390           fp_args++;
1391           // Allocate slots for callee to stuff register args the stack.
1392           stk_args += 2;
1393 #endif
1394         } else {
1395           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1396           stk_args += 2;
1397         }
1398         break;
1399       case T_LONG:
1400         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1401         // fall through
1402       case T_OBJECT:
1403       case T_ARRAY:
1404       case T_ADDRESS:
1405       case T_METADATA:
1406         if (int_args < Argument::n_int_register_parameters_c) {
1407           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1408 #ifdef _WIN64
1409           fp_args++;
1410           stk_args += 2;
1411 #endif
1412         } else {
1413           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1414           stk_args += 2;
1415         }
1416         break;
1417       case T_FLOAT:
1418         if (fp_args < Argument::n_float_register_parameters_c) {
1419           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1420 #ifdef _WIN64
1421           int_args++;
1422           // Allocate slots for callee to stuff register args the stack.
1423           stk_args += 2;
1424 #endif
1425         } else {
1426           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1427           stk_args += 2;
1428         }
1429         break;
1430       case T_DOUBLE:
1431         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1432         if (fp_args < Argument::n_float_register_parameters_c) {
1433           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1434 #ifdef _WIN64
1435           int_args++;
1436           // Allocate slots for callee to stuff register args the stack.
1437           stk_args += 2;
1438 #endif
1439         } else {
1440           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1441           stk_args += 2;
1442         }
1443         break;
1444       case T_VOID: // Halves of longs and doubles
1445         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1446         regs[i].set_bad();
1447         break;
1448       default:
1449         ShouldNotReachHere();
1450         break;
1451       }
1452     }
1453 #ifdef _WIN64
1454   // windows abi requires that we always allocate enough stack space
1455   // for 4 64bit registers to be stored down.
1456   if (stk_args < 8) {
1457     stk_args = 8;
1458   }
1459 #endif // _WIN64
1460 
1461   return stk_args;
1462 }
1463 
1464 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1465                                              uint num_bits,
1466                                              uint total_args_passed) {
1467   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1468          "only certain vector sizes are supported for now");
1469 
1470   static const XMMRegister VEC_ArgReg[32] = {
1471      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1472      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1473     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1474     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1475   };
1476 
1477   uint stk_args = 0;
1478   uint fp_args = 0;
1479 
1480   for (uint i = 0; i < total_args_passed; i++) {
1481     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1482     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1483     regs[i].set_pair(vmreg->next(next_val), vmreg);
1484   }
1485 
1486   return stk_args;
1487 }
1488 
1489 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1490   // We always ignore the frame_slots arg and just use the space just below frame pointer
1491   // which by this time is free to use
1492   switch (ret_type) {
1493   case T_FLOAT:
1494     __ movflt(Address(rbp, -wordSize), xmm0);
1495     break;
1496   case T_DOUBLE:
1497     __ movdbl(Address(rbp, -wordSize), xmm0);
1498     break;
1499   case T_VOID:  break;
1500   default: {
1501     __ movptr(Address(rbp, -wordSize), rax);
1502     }
1503   }
1504 }
1505 
1506 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1507   // We always ignore the frame_slots arg and just use the space just below frame pointer
1508   // which by this time is free to use
1509   switch (ret_type) {
1510   case T_FLOAT:
1511     __ movflt(xmm0, Address(rbp, -wordSize));
1512     break;
1513   case T_DOUBLE:
1514     __ movdbl(xmm0, Address(rbp, -wordSize));
1515     break;
1516   case T_VOID:  break;
1517   default: {
1518     __ movptr(rax, Address(rbp, -wordSize));
1519     }
1520   }
1521 }
1522 
1523 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1524     for ( int i = first_arg ; i < arg_count ; i++ ) {
1525       if (args[i].first()->is_Register()) {
1526         __ push(args[i].first()->as_Register());
1527       } else if (args[i].first()->is_XMMRegister()) {
1528         __ subptr(rsp, 2*wordSize);
1529         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1530       }
1531     }
1532 }
1533 
1534 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1535     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1536       if (args[i].first()->is_Register()) {
1537         __ pop(args[i].first()->as_Register());
1538       } else if (args[i].first()->is_XMMRegister()) {
1539         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1540         __ addptr(rsp, 2*wordSize);
1541       }
1542     }
1543 }
1544 
1545 static void verify_oop_args(MacroAssembler* masm,
1546                             const methodHandle& method,
1547                             const BasicType* sig_bt,
1548                             const VMRegPair* regs) {
1549   Register temp_reg = rbx;  // not part of any compiled calling seq
1550   if (VerifyOops) {
1551     for (int i = 0; i < method->size_of_parameters(); i++) {
1552       if (is_reference_type(sig_bt[i])) {
1553         VMReg r = regs[i].first();
1554         assert(r->is_valid(), "bad oop arg");
1555         if (r->is_stack()) {
1556           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1557           __ verify_oop(temp_reg);
1558         } else {
1559           __ verify_oop(r->as_Register());
1560         }
1561       }
1562     }
1563   }
1564 }
1565 
1566 static void check_continuation_enter_argument(VMReg actual_vmreg,
1567                                               Register expected_reg,
1568                                               const char* name) {
1569   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1570   assert(actual_vmreg->as_Register() == expected_reg,
1571          "%s is in unexpected register: %s instead of %s",
1572          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1573 }
1574 
1575 
1576 //---------------------------- continuation_enter_setup ---------------------------
1577 //
1578 // Arguments:
1579 //   None.
1580 //
1581 // Results:
1582 //   rsp: pointer to blank ContinuationEntry
1583 //
1584 // Kills:
1585 //   rax
1586 //
1587 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1588   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1589   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1590   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1591 
1592   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1593   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1594 
1595   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1596   OopMap* map = new OopMap(frame_size, 0);
1597 
1598   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1599   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1600   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1601 
1602   return map;
1603 }
1604 
1605 //---------------------------- fill_continuation_entry ---------------------------
1606 //
1607 // Arguments:
1608 //   rsp: pointer to blank Continuation entry
1609 //   reg_cont_obj: pointer to the continuation
1610 //   reg_flags: flags
1611 //
1612 // Results:
1613 //   rsp: pointer to filled out ContinuationEntry
1614 //
1615 // Kills:
1616 //   rax
1617 //
1618 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1619   assert_different_registers(rax, reg_cont_obj, reg_flags);
1620 #ifdef ASSERT
1621   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1622 #endif
1623   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1624   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1625   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1626   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1627   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1628 
1629   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1630   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1631 
1632   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1633 }
1634 
1635 //---------------------------- continuation_enter_cleanup ---------------------------
1636 //
1637 // Arguments:
1638 //   rsp: pointer to the ContinuationEntry
1639 //
1640 // Results:
1641 //   rsp: pointer to the spilled rbp in the entry frame
1642 //
1643 // Kills:
1644 //   rbx
1645 //
1646 static void continuation_enter_cleanup(MacroAssembler* masm) {
1647 #ifdef ASSERT
1648   Label L_good_sp;
1649   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1650   __ jcc(Assembler::equal, L_good_sp);
1651   __ stop("Incorrect rsp at continuation_enter_cleanup");
1652   __ bind(L_good_sp);
1653 #endif
1654   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1655   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1656   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1657   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1658   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1659 }
1660 
// Emits the code for the enterSpecial continuation intrinsic.
//
// The generated code consists of, in order:
//   - an interpreter entry (used in interp_only_mode) that loads the three
//     arguments from the interpreter stack into c_rarg1..c_rarg3,
//   - the compiled entry, aligned to CodeEntryAlignment,
//   - the thaw path, taken from either entry when isContinue is non-zero,
//   - the common exit path, and
//   - the exception handling path.
//
// Parameters:
//   regs:  argument locations; only checked (in debug builds) against the
//          registers this code expects.
//   oop_maps: receives one GC map per call site emitted here (three in total).
//   exception_offset, frame_complete, interpreted_entry_offset,
//   compiled_entry_offset: out parameters, each set to a code offset relative
//          to the pc at which generation started.
//   stack_slots: out parameter; set to 2 and then increased by
//          continuation_enter_setup().
//
// Also records ContinuationEntry::_thaw_call_pc_offset, _return_pc_offset and
// _cleanup_offset, again relative to the start pc.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj   = 0;
  int pos_is_cont    = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj   = c_rarg1;
  Register reg_is_cont    = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  // All offsets recorded below are relative to this pc.
  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    // Temporarily take the return address off the stack so the arguments
    // can be addressed from rsp and rsp can be realigned; it is pushed
    // back below.
    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    // Note: the isVirtualThread argument is what gets stored as the entry's flags.
    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    // The GC map is registered at the pc right after the call.
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  // Note: the isVirtualThread argument is what gets stored as the entry's flags.
  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  // The GC map is registered at the pc right after the call.
  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  // The return pc of the thaw call gets its own copy of the compiled
  // entry's map, since that map object was already added above.
  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  continuation_enter_cleanup(masm);
  __ pop(rbp);

  // Arguments for exception_handler_for_return_address(thread, return_address).
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  // With the saved exception oop popped, the next stack word is the return
  // address that was read into c_rarg1 above.
  __ pop(rdx);
  __ jmp(rbx);
}
1833 
// Emits the code for the continuation yield intrinsic.
//
// The generated code sets up a minimal frame, calls
// Continuation::freeze_entry() as a leaf call with (thread, rsp), and then
// either
//   - on a zero value in rax: switches rsp to the thread's current
//     ContinuationEntry, runs the entry cleanup and returns from there, or
//   - on a non-zero value in rax ("pinned"): returns to the caller, going
//     through the forward-exception stub first if the thread has a pending
//     exception.
//
// Parameters:
//   regs: unused here.
//   oop_maps: receives a single GC map, registered at frame_complete.
//   frame_complete: out, offset of the pc right after the frame is set up.
//   stack_slots: out, frame size derived from the layout enum below.
//   compiled_entry_offset: out, always 0 (the entry is the first instruction).
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in VMReg stack slots: saved rbp followed by the return
  // address, two slots each.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize /  VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  // Publish this frame (with the_pc as its pc) as the last Java frame for
  // the duration of the freeze call.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // NOTE(review): rax is presumably the freeze result; non-zero is treated
  // as "pinned" here.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Jump the stack pointer to the thread's ContinuationEntry and return
  // through it, the same way the enter stub's exit path does.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1895 
// SharedRuntime-scoped entry point that simply forwards to the file-local
// continuation_enter_cleanup(); the leading :: selects the free function
// rather than this member.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1899 
1900 static void gen_special_dispatch(MacroAssembler* masm,
1901                                  const methodHandle& method,
1902                                  const BasicType* sig_bt,
1903                                  const VMRegPair* regs) {
1904   verify_oop_args(masm, method, sig_bt, regs);
1905   vmIntrinsics::ID iid = method->intrinsic_id();
1906 
1907   // Now write the args into the outgoing interpreter space
1908   bool     has_receiver   = false;
1909   Register receiver_reg   = noreg;
1910   int      member_arg_pos = -1;
1911   Register member_reg     = noreg;
1912   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1913   if (ref_kind != 0) {
1914     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1915     member_reg = rbx;  // known to be free at this point
1916     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1917   } else if (iid == vmIntrinsics::_invokeBasic) {
1918     has_receiver = true;
1919   } else if (iid == vmIntrinsics::_linkToNative) {
1920     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1921     member_reg = rbx;  // known to be free at this point
1922   } else {
1923     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1924   }
1925 
1926   if (member_reg != noreg) {
1927     // Load the member_arg into register, if necessary.
1928     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1929     VMReg r = regs[member_arg_pos].first();
1930     if (r->is_stack()) {
1931       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1932     } else {
1933       // no data motion is needed
1934       member_reg = r->as_Register();
1935     }
1936   }
1937 
1938   if (has_receiver) {
1939     // Make sure the receiver is loaded into a register.
1940     assert(method->size_of_parameters() > 0, "oob");
1941     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1942     VMReg r = regs[0].first();
1943     assert(r->is_valid(), "bad receiver arg");
1944     if (r->is_stack()) {
1945       // Porting note:  This assumes that compiled calling conventions always
1946       // pass the receiver oop in a register.  If this is not true on some
1947       // platform, pick a temp and load the receiver from stack.
1948       fatal("receiver always in a register");
1949       receiver_reg = j_rarg0;  // known to be free at this point
1950       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1951     } else {
1952       // no data motion is needed
1953       receiver_reg = r->as_Register();
1954     }
1955   }
1956 
1957   // Figure out which address we are really jumping to:
1958   MethodHandles::generate_method_handle_dispatch(masm, iid,
1959                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1960 }
1961 
1962 // ---------------------------------------------------------------------------
1963 // Generate a native wrapper for a given method.  The method takes arguments
1964 // in the Java compiled code convention, marshals them to the native
1965 // convention (handlizes oops, etc), transitions to native, makes the call,
1966 // returns to java state (possibly blocking), unhandlizes any result and
1967 // returns.
1968 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, because it's impossible for them
// to be thrown.
1977 //
1978 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1979                                                 const methodHandle& method,
1980                                                 int compile_id,
1981                                                 BasicType* in_sig_bt,
1982                                                 VMRegPair* in_regs,
1983                                                 BasicType ret_type) {
1984   if (method->is_continuation_native_intrinsic()) {
1985     int exception_offset = -1;
1986     OopMapSet* oop_maps = new OopMapSet();
1987     int frame_complete = -1;
1988     int stack_slots = -1;
1989     int interpreted_entry_offset = -1;
1990     int vep_offset = -1;
1991     if (method->is_continuation_enter_intrinsic()) {
1992       gen_continuation_enter(masm,
1993                              in_regs,
1994                              exception_offset,
1995                              oop_maps,
1996                              frame_complete,
1997                              stack_slots,
1998                              interpreted_entry_offset,
1999                              vep_offset);
2000     } else if (method->is_continuation_yield_intrinsic()) {
2001       gen_continuation_yield(masm,
2002                              in_regs,
2003                              oop_maps,
2004                              frame_complete,
2005                              stack_slots,
2006                              vep_offset);
2007     } else {
2008       guarantee(false, "Unknown Continuation native intrinsic");
2009     }
2010 
2011 #ifdef ASSERT
2012     if (method->is_continuation_enter_intrinsic()) {
2013       assert(interpreted_entry_offset != -1, "Must be set");
2014       assert(exception_offset != -1,         "Must be set");
2015     } else {
2016       assert(interpreted_entry_offset == -1, "Must be unset");
2017       assert(exception_offset == -1,         "Must be unset");
2018     }
2019     assert(frame_complete != -1,    "Must be set");
2020     assert(stack_slots != -1,       "Must be set");
2021     assert(vep_offset != -1,        "Must be set");
2022 #endif
2023 
2024     __ flush();
2025     nmethod* nm = nmethod::new_native_nmethod(method,
2026                                               compile_id,
2027                                               masm->code(),
2028                                               vep_offset,
2029                                               frame_complete,
2030                                               stack_slots,
2031                                               in_ByteSize(-1),
2032                                               in_ByteSize(-1),
2033                                               oop_maps,
2034                                               exception_offset);
2035     if (nm == nullptr) return nm;
2036     if (method->is_continuation_enter_intrinsic()) {
2037       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2038     } else if (method->is_continuation_yield_intrinsic()) {
2039       _cont_doYield_stub = nm;
2040     }
2041     return nm;
2042   }
2043 
2044   if (method->is_method_handle_intrinsic()) {
2045     vmIntrinsics::ID iid = method->intrinsic_id();
2046     intptr_t start = (intptr_t)__ pc();
2047     int vep_offset = ((intptr_t)__ pc()) - start;
2048     gen_special_dispatch(masm,
2049                          method,
2050                          in_sig_bt,
2051                          in_regs);
2052     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2053     __ flush();
2054     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2055     return nmethod::new_native_nmethod(method,
2056                                        compile_id,
2057                                        masm->code(),
2058                                        vep_offset,
2059                                        frame_complete,
2060                                        stack_slots / VMRegImpl::slots_per_word,
2061                                        in_ByteSize(-1),
2062                                        in_ByteSize(-1),
2063                                        nullptr);
2064   }
2065   address native_func = method->native_function();
2066   assert(native_func != nullptr, "must have function");
2067 
2068   // An OopMap for lock (and class if static)
2069   OopMapSet *oop_maps = new OopMapSet();
2070   intptr_t start = (intptr_t)__ pc();
2071 
2072   // We have received a description of where all the java arg are located
2073   // on entry to the wrapper. We need to convert these args to where
2074   // the jni function will expect them. To figure out where they go
2075   // we convert the java signature to a C signature by inserting
2076   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2077 
2078   const int total_in_args = method->size_of_parameters();
2079   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2080 
2081   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2082   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2083 
2084   int argc = 0;
2085   out_sig_bt[argc++] = T_ADDRESS;
2086   if (method->is_static()) {
2087     out_sig_bt[argc++] = T_OBJECT;
2088   }
2089 
2090   for (int i = 0; i < total_in_args ; i++ ) {
2091     out_sig_bt[argc++] = in_sig_bt[i];
2092   }
2093 
2094   // Now figure out where the args must be stored and how much stack space
2095   // they require.
2096   int out_arg_slots;
2097   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2098 
2099   // Compute framesize for the wrapper.  We need to handlize all oops in
2100   // incoming registers
2101 
2102   // Calculate the total number of stack slots we will need.
2103 
2104   // First count the abi requirement plus all of the outgoing args
2105   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2106 
2107   // Now the space for the inbound oop handle area
2108   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2109 
2110   int oop_handle_offset = stack_slots;
2111   stack_slots += total_save_slots;
2112 
2113   // Now any space we need for handlizing a klass if static method
2114 
2115   int klass_slot_offset = 0;
2116   int klass_offset = -1;
2117   int lock_slot_offset = 0;
2118   bool is_static = false;
2119 
2120   if (method->is_static()) {
2121     klass_slot_offset = stack_slots;
2122     stack_slots += VMRegImpl::slots_per_word;
2123     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2124     is_static = true;
2125   }
2126 
2127   // Plus a lock if needed
2128 
2129   if (method->is_synchronized()) {
2130     lock_slot_offset = stack_slots;
2131     stack_slots += VMRegImpl::slots_per_word;
2132   }
2133 
2134   // Now a place (+2) to save return values or temp during shuffling
2135   // + 4 for return address (which we own) and saved rbp
2136   stack_slots += 6;
2137 
2138   // Ok The space we have allocated will look like:
2139   //
2140   //
2141   // FP-> |                     |
2142   //      |---------------------|
2143   //      | 2 slots for moves   |
2144   //      |---------------------|
2145   //      | lock box (if sync)  |
2146   //      |---------------------| <- lock_slot_offset
2147   //      | klass (if static)   |
2148   //      |---------------------| <- klass_slot_offset
2149   //      | oopHandle area      |
2150   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2151   //      | outbound memory     |
2152   //      | based arguments     |
2153   //      |                     |
2154   //      |---------------------|
2155   //      |                     |
2156   // SP-> | out_preserved_slots |
2157   //
2158   //
2159 
2160 
2161   // Now compute actual number of stack words we need rounding to make
2162   // stack properly aligned.
2163   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2164 
2165   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2166 
2167   // First thing make an ic check to see if we should even be here
2168 
2169   // We are free to use all registers as temps without saving them and
2170   // restoring them except rbp. rbp is the only callee save register
2171   // as far as the interpreter and the compiler(s) are concerned.
2172 
2173   const Register receiver = j_rarg0;
2174 
2175   Label exception_pending;
2176 
2177   assert_different_registers(receiver, rscratch1, rscratch2);
2178   __ verify_oop(receiver);
2179   __ ic_check(8 /* end_alignment */);
2180 
2181   int vep_offset = ((intptr_t)__ pc()) - start;
2182 
2183   if (method->needs_clinit_barrier()) {
2184     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
2185     Label L_skip_barrier;
2186     Register klass = r10;
2187     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2188     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2189 
2190     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2191 
2192     __ bind(L_skip_barrier);
2193   }
2194 
2195 #ifdef COMPILER1
2196   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2197   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2198     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2199   }
2200 #endif // COMPILER1
2201 
2202   // The instruction at the verified entry point must be 5 bytes or longer
2203   // because it can be patched on the fly by make_non_entrant. The stack bang
2204   // instruction fits that requirement.
2205 
2206   // Generate stack overflow check
2207   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2208 
2209   // Generate a new frame for the wrapper.
2210   __ enter();
2211   // -2 because return address is already present and so is saved rbp
2212   __ subptr(rsp, stack_size - 2*wordSize);
2213 
2214   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2215   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2216   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2217 
2218   // Frame is now completed as far as size and linkage.
2219   int frame_complete = ((intptr_t)__ pc()) - start;
2220 
2221 #ifdef ASSERT
2222   __ check_stack_alignment(rsp, "improperly aligned stack");
2223 #endif /* ASSERT */
2224 
2225 
2226   // We use r14 as the oop handle for the receiver/klass
2227   // It is callee save so it survives the call to native
2228 
2229   const Register oop_handle_reg = r14;
2230 
2231   //
2232   // We immediately shuffle the arguments so that any vm call we have to
2233   // make from here on out (sync slow path, jvmti, etc.) we will have
2234   // captured the oops from our caller and have a valid oopMap for
2235   // them.
2236 
2237   // -----------------
2238   // The Grand Shuffle
2239 
2240   // The Java calling convention is either equal (linux) or denser (win64) than the
2241   // c calling convention. However, because of the jni_env argument, the c calling
2242   // convention always has at least one more (and two for static) arguments than Java.
2243   // Therefore if we move the args from java -> c backwards then we will never have
2244   // a register->register conflict and we don't have to build a dependency graph
2245   // and figure out how to break any cycles.
2246   //
2247 
2248   // Record esp-based slot for receiver on stack for non-static methods
2249   int receiver_offset = -1;
2250 
2251   // This is a trick. We double the stack slots so we can claim
2252   // the oops in the caller's frame. Since we are sure to have
2253   // more args than the caller doubling is enough to make
2254   // sure we can capture all the incoming oop args from the
2255   // caller.
2256   //
2257   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2258 
2259   // Mark location of rbp (someday)
2260   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2261 
2262   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2263   // All inbound args are referenced based on rbp and all outbound args via rsp.
2264 
2265 
2266 #ifdef ASSERT
2267   bool reg_destroyed[Register::number_of_registers];
2268   bool freg_destroyed[XMMRegister::number_of_registers];
2269   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2270     reg_destroyed[r] = false;
2271   }
2272   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2273     freg_destroyed[f] = false;
2274   }
2275 
2276 #endif /* ASSERT */
2277 
2278   // For JNI natives the incoming and outgoing registers are offset upwards.
2279   GrowableArray<int> arg_order(2 * total_in_args);
2280 
2281   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2282     arg_order.push(i);
2283     arg_order.push(c_arg);
2284   }
2285 
2286   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2287     int i = arg_order.at(ai);
2288     int c_arg = arg_order.at(ai + 1);
2289     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2290 #ifdef ASSERT
2291     if (in_regs[i].first()->is_Register()) {
2292       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2293     } else if (in_regs[i].first()->is_XMMRegister()) {
2294       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2295     }
2296     if (out_regs[c_arg].first()->is_Register()) {
2297       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2298     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2299       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2300     }
2301 #endif /* ASSERT */
2302     switch (in_sig_bt[i]) {
2303       case T_ARRAY:
2304       case T_OBJECT:
2305         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2306                     ((i == 0) && (!is_static)),
2307                     &receiver_offset);
2308         break;
2309       case T_VOID:
2310         break;
2311 
2312       case T_FLOAT:
2313         __ float_move(in_regs[i], out_regs[c_arg]);
2314           break;
2315 
2316       case T_DOUBLE:
2317         assert( i + 1 < total_in_args &&
2318                 in_sig_bt[i + 1] == T_VOID &&
2319                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2320         __ double_move(in_regs[i], out_regs[c_arg]);
2321         break;
2322 
2323       case T_LONG :
2324         __ long_move(in_regs[i], out_regs[c_arg]);
2325         break;
2326 
2327       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2328 
2329       default:
2330         __ move32_64(in_regs[i], out_regs[c_arg]);
2331     }
2332   }
2333 
2334   int c_arg;
2335 
2336   // Pre-load a static method's oop into r14.  Used both by locking code and
2337   // the normal JNI call code.
2338   // point c_arg at the first arg that is already loaded in case we
2339   // need to spill before we call out
2340   c_arg = total_c_args - total_in_args;
2341 
2342   if (method->is_static()) {
2343 
2344     //  load oop into a register
2345     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2346 
2347     // Now handlize the static class mirror; it's known to be non-null.
2348     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2349     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2350 
2351     // Now get the handle
2352     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2353     // store the klass handle as second argument
2354     __ movptr(c_rarg1, oop_handle_reg);
2355     // and protect the arg if we must spill
2356     c_arg--;
2357   }
2358 
2359   // Change state to native (we save the return address in the thread, since it might not
2360   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2361   // points into the right code segment. It does not have to be the correct return pc.
2362   // We use the same pc/oopMap repeatedly when we call out
2363 
2364   Label native_return;
2365   if (method->is_object_wait0()) {
2366     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2367     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2368   } else {
2369     intptr_t the_pc = (intptr_t) __ pc();
2370     oop_maps->add_gc_map(the_pc - start, map);
2371 
2372     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2373   }
2374 
2375   // We have all of the arguments setup at this point. We must not touch any register
2376   // argument registers at this point (what if we save/restore them there are no oop?
2377 
2378   if (DTraceMethodProbes) {
2379     // protect the args we've loaded
2380     save_args(masm, total_c_args, c_arg, out_regs);
2381     __ mov_metadata(c_rarg1, method());
2382     __ call_VM_leaf(
2383       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2384       r15_thread, c_rarg1);
2385     restore_args(masm, total_c_args, c_arg, out_regs);
2386   }
2387 
2388   // RedefineClasses() tracing support for obsolete method entry
2389   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2390     // protect the args we've loaded
2391     save_args(masm, total_c_args, c_arg, out_regs);
2392     __ mov_metadata(c_rarg1, method());
2393     __ call_VM_leaf(
2394       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2395       r15_thread, c_rarg1);
2396     restore_args(masm, total_c_args, c_arg, out_regs);
2397   }
2398 
2399   // Lock a synchronized method
2400 
2401   // Register definitions used by locking and unlocking
2402 
2403   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2404   const Register obj_reg  = rbx;  // Will contain the oop
2405   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2406 
2407   Label slow_path_lock;
2408   Label lock_done;
2409 
2410   if (method->is_synchronized()) {
2411     // Get the handle (the 2nd argument)
2412     __ mov(oop_handle_reg, c_rarg1);
2413 
2414     // Get address of the box
2415 
2416     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2417 
2418     // Load the oop from the handle
2419     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2420 
2421     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2422 
2423     // Slow path will re-enter here
2424     __ bind(lock_done);
2425   }
2426 
2427   // Finally just about ready to make the JNI call
2428 
2429   // get JNIEnv* which is first argument to native
2430   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2431 
2432   // Now set thread in native
2433   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2434 
2435   __ call(RuntimeAddress(native_func));
2436 
2437   // Verify or restore cpu control state after JNI call
2438   __ restore_cpu_control_state_after_jni(rscratch1);
2439 
2440   // Unpack native results.
2441   switch (ret_type) {
2442   case T_BOOLEAN: __ c2bool(rax);            break;
2443   case T_CHAR   : __ movzwl(rax, rax);      break;
2444   case T_BYTE   : __ sign_extend_byte (rax); break;
2445   case T_SHORT  : __ sign_extend_short(rax); break;
2446   case T_INT    : /* nothing to do */        break;
2447   case T_DOUBLE :
2448   case T_FLOAT  :
2449     // Result is in xmm0 we'll save as needed
2450     break;
2451   case T_ARRAY:                 // Really a handle
2452   case T_OBJECT:                // Really a handle
2453       break; // can't de-handlize until after safepoint check
2454   case T_VOID: break;
2455   case T_LONG: break;
2456   default       : ShouldNotReachHere();
2457   }
2458 
2459   // Switch thread to "native transition" state before reading the synchronization state.
2460   // This additional state is necessary because reading and testing the synchronization
2461   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2462   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2463   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2464   //     Thread A is resumed to finish this native method, but doesn't block here since it
2465   //     didn't see any synchronization in progress, and escapes.
2466   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2467 
2468   // Force this write out before the read below
2469   if (!UseSystemMemoryBarrier) {
2470     __ membar(Assembler::Membar_mask_bits(
2471               Assembler::LoadLoad | Assembler::LoadStore |
2472               Assembler::StoreLoad | Assembler::StoreStore));
2473   }
2474 
2475   // check for safepoint operation in progress and/or pending suspend requests
2476   {
2477     Label Continue;
2478     Label slow_path;
2479 
2480     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2481 
2482     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2483     __ jcc(Assembler::equal, Continue);
2484     __ bind(slow_path);
2485 
2486     // Don't use call_VM as it will see a possible pending exception and forward it
2487     // and never return here preventing us from clearing _last_native_pc down below.
2488     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2489     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2490     // by hand.
2491     //
2492     __ vzeroupper();
2493     save_native_result(masm, ret_type, stack_slots);
2494     __ mov(c_rarg0, r15_thread);
2495     __ mov(r12, rsp); // remember sp
2496     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2497     __ andptr(rsp, -16); // align stack as required by ABI
2498     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2499     __ mov(rsp, r12); // restore sp
2500     __ reinit_heapbase();
2501     // Restore any method result value
2502     restore_native_result(masm, ret_type, stack_slots);
2503     __ bind(Continue);
2504   }
2505 
2506   // change thread state
2507   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2508 
2509   if (method->is_object_wait0()) {
2510     // Check preemption for Object.wait()
2511     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2512     __ cmpptr(rscratch1, NULL_WORD);
2513     __ jccb(Assembler::equal, native_return);
2514     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2515     __ jmp(rscratch1);
2516     __ bind(native_return);
2517 
2518     intptr_t the_pc = (intptr_t) __ pc();
2519     oop_maps->add_gc_map(the_pc - start, map);
2520   }
2521 
2522 
2523   Label reguard;
2524   Label reguard_done;
2525   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2526   __ jcc(Assembler::equal, reguard);
2527   __ bind(reguard_done);
2528 
2529   // native result if any is live
2530 
2531   // Unlock
2532   Label slow_path_unlock;
2533   Label unlock_done;
2534   if (method->is_synchronized()) {
2535 
2536     Label fast_done;
2537 
2538     // Get locked oop from the handle we passed to jni
2539     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2540 
2541     // Must save rax if it is live now because cmpxchg must use it
2542     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2543       save_native_result(masm, ret_type, stack_slots);
2544     }
2545 
2546     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2547 
2548     // slow path re-enters here
2549     __ bind(unlock_done);
2550     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2551       restore_native_result(masm, ret_type, stack_slots);
2552     }
2553 
2554     __ bind(fast_done);
2555   }
2556   if (DTraceMethodProbes) {
2557     save_native_result(masm, ret_type, stack_slots);
2558     __ mov_metadata(c_rarg1, method());
2559     __ call_VM_leaf(
2560          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2561          r15_thread, c_rarg1);
2562     restore_native_result(masm, ret_type, stack_slots);
2563   }
2564 
2565   __ reset_last_Java_frame(false);
2566 
2567   // Unbox oop result, e.g. JNIHandles::resolve value.
2568   if (is_reference_type(ret_type)) {
2569     __ resolve_jobject(rax /* value */,
2570                        rcx /* tmp */);
2571   }
2572 
2573   if (CheckJNICalls) {
2574     // clear_pending_jni_exception_check
2575     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2576   }
2577 
2578   // reset handle block
2579   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2580   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2581 
2582   // pop our frame
2583 
2584   __ leave();
2585 
2586 #if INCLUDE_JFR
2587   // We need to do a poll test after unwind in case the sampler
2588   // managed to sample the native frame after returning to Java.
2589   Label L_return;
2590   address poll_test_pc = __ pc();
2591   __ relocate(relocInfo::poll_return_type);
2592   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2593   __ jccb(Assembler::zero, L_return);
2594   __ lea(rscratch1, InternalAddress(poll_test_pc));
2595   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2596   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2597     "polling page return stub not created yet");
2598   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2599   __ jump(RuntimeAddress(stub));
2600   __ bind(L_return);
2601 #endif // INCLUDE_JFR
2602 
2603   // Any exception pending?
2604   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2605   __ jcc(Assembler::notEqual, exception_pending);
2606 
2607   // Return
2608 
2609   __ ret(0);
2610 
2611   // Unexpected paths are out of line and go here
2612 
2613   // forward the exception
2614   __ bind(exception_pending);
2615 
2616   // and forward the exception
2617   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2618 
2619   // Slow path locking & unlocking
2620   if (method->is_synchronized()) {
2621 
2622     // BEGIN Slow path lock
2623     __ bind(slow_path_lock);
2624 
2625     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2626     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2627 
2628     // protect the args we've loaded
2629     save_args(masm, total_c_args, c_arg, out_regs);
2630 
2631     __ mov(c_rarg0, obj_reg);
2632     __ mov(c_rarg1, lock_reg);
2633     __ mov(c_rarg2, r15_thread);
2634 
2635     // Not a leaf but we have last_Java_frame setup as we want.
2636     // We don't want to unmount in case of contention since that would complicate preserving
2637     // the arguments that had already been marshalled into the native convention. So we force
2638     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2639     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2640     __ push_cont_fastpath();
2641     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2642     __ pop_cont_fastpath();
2643     restore_args(masm, total_c_args, c_arg, out_regs);
2644 
2645 #ifdef ASSERT
2646     { Label L;
2647     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2648     __ jcc(Assembler::equal, L);
2649     __ stop("no pending exception allowed on exit from monitorenter");
2650     __ bind(L);
2651     }
2652 #endif
2653     __ jmp(lock_done);
2654 
2655     // END Slow path lock
2656 
2657     // BEGIN Slow path unlock
2658     __ bind(slow_path_unlock);
2659 
2660     // If we haven't already saved the native result we must save it now as xmm registers
2661     // are still exposed.
2662     __ vzeroupper();
2663     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2664       save_native_result(masm, ret_type, stack_slots);
2665     }
2666 
2667     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2668 
2669     __ mov(c_rarg0, obj_reg);
2670     __ mov(c_rarg2, r15_thread);
2671     __ mov(r12, rsp); // remember sp
2672     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2673     __ andptr(rsp, -16); // align stack as required by ABI
2674 
2675     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2676     // NOTE that obj_reg == rbx currently
2677     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2678     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2679 
2680     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2681     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2682     __ mov(rsp, r12); // restore sp
2683     __ reinit_heapbase();
2684 #ifdef ASSERT
2685     {
2686       Label L;
2687       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2688       __ jcc(Assembler::equal, L);
2689       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2690       __ bind(L);
2691     }
2692 #endif /* ASSERT */
2693 
2694     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2695 
2696     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2697       restore_native_result(masm, ret_type, stack_slots);
2698     }
2699     __ jmp(unlock_done);
2700 
2701     // END Slow path unlock
2702 
2703   } // synchronized
2704 
2705   // SLOW PATH Reguard the stack if needed
2706 
2707   __ bind(reguard);
2708   __ vzeroupper();
2709   save_native_result(masm, ret_type, stack_slots);
2710   __ mov(r12, rsp); // remember sp
2711   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2712   __ andptr(rsp, -16); // align stack as required by ABI
2713   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2714   __ mov(rsp, r12); // restore sp
2715   __ reinit_heapbase();
2716   restore_native_result(masm, ret_type, stack_slots);
2717   // and continue
2718   __ jmp(reguard_done);
2719 
2720 
2721 
2722   __ flush();
2723 
2724   nmethod *nm = nmethod::new_native_nmethod(method,
2725                                             compile_id,
2726                                             masm->code(),
2727                                             vep_offset,
2728                                             frame_complete,
2729                                             stack_slots / VMRegImpl::slots_per_word,
2730                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2731                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2732                                             oop_maps);
2733 
2734   return nm;
2735 }
2736 
2737 // Returns the size adjustment (in number of words) for a c2i adapter activation
2738 // during deoptimization: callee locals beyond its parameters, in stack elements.
2739 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2740   return Interpreter::stackElementWords * (callee_locals - callee_parameters);
2741 }
2742 
2743 
2744 uint SharedRuntime::out_preserve_stack_slots() {
2745   return 0u;  // this port reports zero out-preserve stack slots
2746 }
2747 
2748 
2749 // Number of stack slots between the incoming argument block and the start
2750 // of a new frame.  The PROLOG must add this many slots to the stack and the
2751 // EPILOG must remove them again.  amd64 needs two slots for the return
2752 // address; the total reported here is 4 + 2 * VerifyStackAtCalls.
2753 uint SharedRuntime::in_preserve_stack_slots() {
2754   return 2 * VerifyStackAtCalls + 4;
2755 }
2756 
2757 VMReg SharedRuntime::thread_register() {
2758   return r15_thread->as_VMReg();  // VMReg form of r15_thread, the register used to address JavaThread fields
2759 }
2760 
2761 //------------------------------generate_deopt_blob----------------------------
2762 void SharedRuntime::generate_deopt_blob() {
2763   // Allocate space for the code
2764   ResourceMark rm;
2765   // Setup code generation tools
2766   int pad = 0;
2767   if (UseAVX > 2) {
2768     pad += 1024;
2769   }
2770   if (UseAPX) {
2771     pad += 1024;
2772   }
2773   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2774   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2775   if (blob != nullptr) {
2776     _deopt_blob = blob->as_deoptimization_blob();
2777     return;
2778   }
2779 
2780   CodeBuffer buffer(name, 2560+pad, 1024);
2781   MacroAssembler* masm = new MacroAssembler(&buffer);
2782   int frame_size_in_words;
2783   OopMap* map = nullptr;
2784   OopMapSet *oop_maps = new OopMapSet();
2785 
2786   // -------------
2787   // This code enters when returning to a de-optimized nmethod.  A return
2788   // address has been pushed on the stack, and return values are in
2789   // registers.
2790   // If we are doing a normal deopt then we were called from the patched
2791   // nmethod from the point we returned to the nmethod. So the return
2792   // address on the stack is wrong by NativeCall::instruction_size
2793   // We will adjust the value so it looks like we have the original return
2794   // address on the stack (like when we eagerly deoptimized).
2795   // In the case of an exception pending when deoptimizing, we enter
2796   // with a return address on the stack that points after the call we patched
2797   // into the exception handler. We have the following register state from,
2798   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2799   //    rax: exception oop
2800   //    rbx: exception handler
2801   //    rdx: throwing pc
2802   // So in this case we simply jam rdx into the useless return address and
2803   // the stack looks just like we want.
2804   //
2805   // At this point we need to de-opt.  We save the argument return
2806   // registers.  We call the first C routine, fetch_unroll_info().  This
2807   // routine captures the return values and returns a structure which
2808   // describes the current frame size and the sizes of all replacement frames.
2809   // The current frame is compiled code and may contain many inlined
2810   // functions, each with their own JVM state.  We pop the current frame, then
2811   // push all the new frames.  Then we call the C routine unpack_frames() to
2812   // populate these frames.  Finally unpack_frames() returns us the new target
2813   // address.  Notice that callee-save registers are BLOWN here; they have
2814   // already been captured in the vframeArray at the time the return PC was
2815   // patched.
2816   address start = __ pc();
2817   Label cont;
2818 
2819   // Prolog for non exception case!
2820 
2821   // Save everything in sight.
2822   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2823 
2824   // Normal deoptimization.  Save exec mode for unpack_frames.
2825   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2826   __ jmp(cont);
2827 
2828   int reexecute_offset = __ pc() - start;
2829   // Reexecute case
2830   // The return address is the pc that describes which bci to re-execute at.
2831 
2832   // No need to update map as each call to save_live_registers will produce identical oopmap
2833   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2834 
2835   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2836   __ jmp(cont);
2837 
2838   int exception_offset = __ pc() - start;
2839 
2840   // Prolog for exception case
2841 
2842   // all registers are dead at this entry point, except for rax, and
2843   // rdx which contain the exception oop and exception pc
2844   // respectively.  Set them in TLS and fall thru to the
2845   // unpack_with_exception_in_tls entry point.
2846 
2847   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2848   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2849 
2850   int exception_in_tls_offset = __ pc() - start;
2851 
2852   // new implementation because exception oop is now passed in JavaThread
2853 
2854   // Prolog for exception case
2855   // All registers must be preserved because they might be used by LinearScan
2856   // Exception oop and throwing PC are passed in JavaThread
2857   // tos: stack at point of call to method that threw the exception (i.e. only
2858   // args are on the stack, no return address)
2859 
2860   // make room on stack for the return address
2861   // It will be patched later with the throwing pc. The correct value is not
2862   // available now because loading it from memory would destroy registers.
2863   __ push(0);
2864 
2865   // Save everything in sight.
2866   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2867 
2868   // Now it is safe to overwrite any register
2869 
2870   // Deopt during an exception.  Save exec mode for unpack_frames.
2871   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2872 
2873   // load throwing pc from JavaThread and patch it as the return address
2874   // of the current frame. Then clear the field in JavaThread
2875 
2876   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2877   __ movptr(Address(rbp, wordSize), rdx);
2878   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2879 
2880 #ifdef ASSERT
2881   // verify that there is really an exception oop in JavaThread
2882   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2883   __ verify_oop(rax);
2884 
2885   // verify that there is no pending exception
2886   Label no_pending_exception;
2887   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2888   __ testptr(rax, rax);
2889   __ jcc(Assembler::zero, no_pending_exception);
2890   __ stop("must not have pending exception here");
2891   __ bind(no_pending_exception);
2892 #endif
2893 
2894   __ bind(cont);
2895 
2896   // Call C code.  Need thread and this frame, but NOT official VM entry
2897   // crud.  We cannot block on this call, no GC can happen.
2898   //
2899   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2900 
2901   // fetch_unroll_info needs to call last_java_frame().
2902 
2903   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2904 #ifdef ASSERT
2905   { Label L;
2906     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2907     __ jcc(Assembler::equal, L);
2908     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2909     __ bind(L);
2910   }
2911 #endif // ASSERT
2912   __ mov(c_rarg0, r15_thread);
2913   __ movl(c_rarg1, r14); // exec_mode
2914   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2915 
2916   // Need to have an oopmap that tells fetch_unroll_info where to
2917   // find any register it might need.
2918   oop_maps->add_gc_map(__ pc() - start, map);
2919 
2920   __ reset_last_Java_frame(false);
2921 
2922   // Load UnrollBlock* into rdi
2923   __ mov(rdi, rax);
2924 
2925   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2926    Label noException;
2927   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2928   __ jcc(Assembler::notEqual, noException);
2929   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2930   // QQQ this is useless it was null above
2931   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2932   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2933   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2934 
2935   __ verify_oop(rax);
2936 
2937   // Overwrite the result registers with the exception results.
2938   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2939   // I think this is useless
2940   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2941 
2942   __ bind(noException);
2943 
2944   // Only register save data is on the stack.
2945   // Now restore the result registers.  Everything else is either dead
2946   // or captured in the vframeArray.
2947   RegisterSaver::restore_result_registers(masm);
2948 
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
2951 
2952   // Pop all the frames we must move/replace.
2953   //
2954   // Frame picture (youngest to oldest)
2955   // 1: self-frame (no frame link)
2956   // 2: deopting frame  (no frame link)
2957   // 3: caller of deopting frame (could be compiled/interpreted).
2958   //
2959   // Note: by leaving the return address of self-frame on the stack
2960   // and using the size of frame 2 to adjust the stack
2961   // when we are done the return to frame 3 will still be on the stack.
2962 
2963   // Pop deoptimized frame
2964   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2965   __ addptr(rsp, rcx);
2966 
2967   // rsp should be pointing at the return address to the caller (3)
2968 
2969   // Pick up the initial fp we should save
2970   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2971   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2972 
2973 #ifdef ASSERT
2974   // Compilers generate code that bang the stack by as much as the
2975   // interpreter would need. So this stack banging should never
2976   // trigger a fault. Verify that it does not on non product builds.
2977   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2978   __ bang_stack_size(rbx, rcx);
2979 #endif
2980 
2981   // Load address of array of frame pcs into rcx
2982   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2983 
2984   // Trash the old pc
2985   __ addptr(rsp, wordSize);
2986 
2987   // Load address of array of frame sizes into rsi
2988   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2989 
2990   // Load counter into rdx
2991   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2992 
2993   // Now adjust the caller's stack to make up for the extra locals
2994   // but record the original sp so that we can save it in the skeletal interpreter
2995   // frame and the stack walking of interpreter_sender will get the unextended sp
2996   // value and not the "real" sp value.
2997 
2998   const Register sender_sp = r8;
2999 
3000   __ mov(sender_sp, rsp);
3001   __ movl(rbx, Address(rdi,
3002                        Deoptimization::UnrollBlock::
3003                        caller_adjustment_offset()));
3004   __ subptr(rsp, rbx);
3005 
3006   // Push interpreter frames in a loop
3007   Label loop;
3008   __ bind(loop);
3009   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3010   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3011   __ pushptr(Address(rcx, 0));          // Save return address
3012   __ enter();                           // Save old & set new ebp
3013   __ subptr(rsp, rbx);                  // Prolog
3014   // This value is corrected by layout_activation_impl
3015   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3016   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3017   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3018   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3019   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3020   __ decrementl(rdx);                   // Decrement counter
3021   __ jcc(Assembler::notZero, loop);
3022   __ pushptr(Address(rcx, 0));          // Save final return address
3023 
3024   // Re-push self-frame
3025   __ enter();                           // Save old & set new ebp
3026 
3027   // Allocate a full sized register save area.
3028   // Return address and rbp are in place, so we allocate two less words.
3029   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3030 
3031   // Restore frame locals after moving the frame
3032   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3033   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3034 
3035   // Call C code.  Need thread but NOT official VM entry
3036   // crud.  We cannot block on this call, no GC can happen.  Call should
3037   // restore return values to their stack-slots with the new SP.
3038   //
3039   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3040 
3041   // Use rbp because the frames look interpreted now
3042   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3043   // Don't need the precise return PC here, just precise enough to point into this code blob.
3044   address the_pc = __ pc();
3045   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3046 
3047   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3048   __ mov(c_rarg0, r15_thread);
3049   __ movl(c_rarg1, r14); // second arg: exec_mode
3050   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3051   // Revert SP alignment after call since we're going to do some SP relative addressing below
3052   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3053 
3054   // Set an oopmap for the call site
3055   // Use the same PC we used for the last java frame
3056   oop_maps->add_gc_map(the_pc - start,
3057                        new OopMap( frame_size_in_words, 0 ));
3058 
3059   // Clear fp AND pc
3060   __ reset_last_Java_frame(true);
3061 
3062   // Collect return values
3063   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3064   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3065   // I think this is useless (throwing pc?)
3066   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3067 
3068   // Pop self-frame.
3069   __ leave();                           // Epilog
3070 
3071   // Jump to interpreter
3072   __ ret(0);
3073 
3074   // Make sure all code is generated
3075   masm->flush();
3076 
3077   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3078   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3079 
3080   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
3081 }
3082 
//------------------------------generate_handler_blob------
//
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
// id       - a polling page stub id. It selects the stub name, the AOT code
//            cache blob id, and two code variants: cause_return (the
//            polling-page return handler) and save_wide_vectors (the vectors
//            safepoint handler).
// call_ptr - address of the runtime routine the generated code calls with
//            the current thread (r15_thread) as its only argument.
//
// Returns the blob found in the AOT code cache when there is one; otherwise
// a freshly generated SafepointBlob, which is also passed to
// AOTCodeCache::store_code_blob().
//
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code.  Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Prefer a blob from the AOT code cache; generate code only when none is found.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  // NOTE(review): call_pc is declared here but never read in this function.
  address call_pc = nullptr;
  int frame_size_in_words;
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending
  // Restore the registers saved on entry, then leave through the
  // forward_exception stub instead of returning to the poll site.

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    //      85 00       test   %eax,(%rax)
    //      85 01       test   %eax,(%rcx)
    //      85 02       test   %eax,(%rdx)
    //      85 03       test   %eax,(%rbx)
    //      85 06       test   %eax,(%rsi)
    //      85 07       test   %eax,(%rdi)
    //
    //   41 85 00       test   %eax,(%r8)
    //   41 85 01       test   %eax,(%r9)
    //   41 85 02       test   %eax,(%r10)
    //   41 85 03       test   %eax,(%r11)
    //   41 85 06       test   %eax,(%r14)
    //   41 85 07       test   %eax,(%r15)
    //
    //      85 04 24    test   %eax,(%rsp)
    //   41 85 04 24    test   %eax,(%r12)
    //      85 45 00    test   %eax,0x0(%rbp)
    //   41 85 45 00    test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is b/w [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     thereby adding additional byte to instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // Step over a REX2 prefix (two bytes) when the instruction starts with one.
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    // Step over a one-byte REX.B prefix when present.
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    // Unsigned comparison: values that were below 4 wrapped around on the
    // subtraction, so only 0 and 1 fall through to the extra-byte adjustment.
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    // (the remaining two bytes: opcode and ModRM).
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  // Out-of-line failure path for the encoding verification above.
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3267 
3268 //
3269 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3270 //
3271 // Generate a stub that calls into vm to find out the proper destination
3272 // of a java call. All the argument registers are live at this point
3273 // but since this is generic code we don't know what they are and the caller
3274 // must do any gc of the args.
3275 //
3276 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3277   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3278   assert(is_resolve_id(id), "expected a resolve stub id");
3279 
3280   const char* name = SharedRuntime::stub_name(id);
3281   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3282   if (blob != nullptr) {
3283     return blob->as_runtime_stub();
3284   }
3285 
3286   // allocate space for the code
3287   ResourceMark rm;
3288   CodeBuffer buffer(name, 1552, 512);
3289   MacroAssembler* masm = new MacroAssembler(&buffer);
3290 
3291   int frame_size_in_words;
3292 
3293   OopMapSet *oop_maps = new OopMapSet();
3294   OopMap* map = nullptr;
3295 
3296   int start = __ offset();
3297 
3298   // No need to save vector registers since they are caller-saved anyway.
3299   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3300 
3301   int frame_complete = __ offset();
3302 
3303   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3304 
3305   __ mov(c_rarg0, r15_thread);
3306 
3307   __ call(RuntimeAddress(destination));
3308 
3309 
3310   // Set an oopmap for the call site.
3311   // We need this not only for callee-saved registers, but also for volatile
3312   // registers that the compiler might be keeping live across a safepoint.
3313 
3314   oop_maps->add_gc_map( __ offset() - start, map);
3315 
3316   // rax contains the address we are going to jump to assuming no exception got installed
3317 
3318   // clear last_Java_sp
3319   __ reset_last_Java_frame(false);
3320   // check for pending exceptions
3321   Label pending;
3322   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3323   __ jcc(Assembler::notEqual, pending);
3324 
3325   // get the returned Method*
3326   __ get_vm_result_metadata(rbx);
3327   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3328 
3329   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3330 
3331   RegisterSaver::restore_live_registers(masm);
3332 
3333   // We are back to the original state on entry and ready to go.
3334 
3335   __ jmp(rax);
3336 
3337   // Pending exception after the safepoint
3338 
3339   __ bind(pending);
3340 
3341   RegisterSaver::restore_live_registers(masm);
3342 
3343   // exception pending => remove activation and forward to exception handler
3344 
3345   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3346 
3347   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3348   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3349 
3350   // -------------
3351   // make sure all code is generated
3352   masm->flush();
3353 
3354   // return the  blob
3355   // frame_size_words or bytes??
3356   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3357 
3358   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3359   return rs_blob;
3360 }
3361 
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs.  If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
//
// id            - a throw stub id; selects the stub name and the AOT code
//                 cache blob id.
// runtime_entry - address of the runtime routine the stub calls with the
//                 current thread (r15_thread) as its only argument.
//
// Returns the blob found in the AOT code cache when there is one; otherwise
// a newly generated RuntimeStub, which is also passed to
// AOTCodeCache::store_code_blob().
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // The enumerators count BytesPerInt-sized slots; rbp and the return
  // address each occupy two of them.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  // Sizes handed to the CodeBuffer constructor below.
  int insts_size = 512;
  int locs_size  = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Prefer a blob from the AOT code cache; generate code only when none is found.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // framesize/2 is the frame size in 8-byte words; an even word count keeps
  // the frame a multiple of 16 bytes.
  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  // (they account for 4 of the framesize slots, hence framesize-4)
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  // the_pc is recorded both as the last Java pc and, below, as the oop map pc.
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

  // Call runtime
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
  // (debug builds only: stop if the runtime call left no pending exception)
#ifdef ASSERT
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3467 
3468 //------------------------------Montgomery multiplication------------------------
3469 //
3470 
3471 #ifndef _WINDOWS
3472 
// Subtract 0:b from carry:a.  Return carry.
//
// a     - minuend words, least significant first; updated in place
// b     - subtrahend words, least significant first
// carry - word treated as sitting above a[len-1]
// len   - number of 64-bit words in a and b. The loop body runs before the
//         count is tested, so len must be at least 1.
//
// Returns carry minus the borrow left over from the word-by-word subtraction.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // clc clears the carry flag before the first sbb. The index and the counter
  // are stepped with inc/dec, which leave the carry flag untouched, so the
  // borrow produced by one sbb is consumed by the sbb of the next iteration
  // and finally by the "sbb $0" applied to the copy of carry.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3490 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// T0, T1 and T2 must be modifiable lvalues (they are read-write asm
// operands); T0 is the least significant word. B is placed in rax, "mul"
// leaves the 128-bit product in rdx:rax, and the add/adc/adc chain adds it
// to T0/T1 and propagates the final carry into T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3500 
// As above, but add twice the double-length result into the
// accumulator.
//
// The product is computed once by "mul" and the add/adc/adc sequence is
// then issued twice, so the carry into T2 is handled separately for each
// of the two additions.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3511 
3512 #else //_WINDOWS
3513 
3514 static julong
3515 sub(julong a[], julong b[], julong carry, long len) {
3516   long i;
3517   julong tmp;
3518   unsigned char c = 1;
3519   for (i = 0; i < len; i++) {
3520     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3521     a[i] = tmp;
3522   }
3523   c = _addcarry_u64(c, carry, ~0, &tmp);
3524   return tmp;
3525 }
3526 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// T0, T1 and T2 must be addressable lvalues (their addresses are passed to
// _addcarry_u64); T0 is the least significant word. _umul128 yields the low
// half of the product in lo and the high half in hi; the three
// _addcarry_u64 steps add lo to T0, hi to T1 and the remaining carry to T2.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3537 
// As above, but add twice the double-length result into the
// accumulator.
//
// The product is computed once by _umul128 and the three-step
// _addcarry_u64 accumulation is then performed twice, each pass starting
// with a carry-in of 0.
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                            \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3551 
3552 #endif //_WINDOWS
3553 
3554 // Fast Montgomery multiplication.  The derivation of the algorithm is
3555 // in  A Cryptographic Library for the Motorola DSP56000,
3556 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3557 
3558 static void NOINLINE
3559 montgomery_multiply(julong a[], julong b[], julong n[],
3560                     julong m[], julong inv, int len) {
3561   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3562   int i;
3563 
3564   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3565 
3566   for (i = 0; i < len; i++) {
3567     int j;
3568     for (j = 0; j < i; j++) {
3569       MACC(a[j], b[i-j], t0, t1, t2);
3570       MACC(m[j], n[i-j], t0, t1, t2);
3571     }
3572     MACC(a[i], b[0], t0, t1, t2);
3573     m[i] = t0 * inv;
3574     MACC(m[i], n[0], t0, t1, t2);
3575 
3576     assert(t0 == 0, "broken Montgomery multiply");
3577 
3578     t0 = t1; t1 = t2; t2 = 0;
3579   }
3580 
3581   for (i = len; i < 2*len; i++) {
3582     int j;
3583     for (j = i-len+1; j < len; j++) {
3584       MACC(a[j], b[i-j], t0, t1, t2);
3585       MACC(m[j], n[i-j], t0, t1, t2);
3586     }
3587     m[i-len] = t0;
3588     t0 = t1; t1 = t2; t2 = 0;
3589   }
3590 
3591   while (t0)
3592     t0 = sub(m, n, t0, len);
3593 }
3594 
3595 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3596 // multiplies so it should be up to 25% faster than Montgomery
3597 // multiplication.  However, its loop control is more complex and it
3598 // may actually run slower on some machines.
3599 
3600 static void NOINLINE
3601 montgomery_square(julong a[], julong n[],
3602                   julong m[], julong inv, int len) {
3603   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3604   int i;
3605 
3606   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3607 
3608   for (i = 0; i < len; i++) {
3609     int j;
3610     int end = (i+1)/2;
3611     for (j = 0; j < end; j++) {
3612       MACC2(a[j], a[i-j], t0, t1, t2);
3613       MACC(m[j], n[i-j], t0, t1, t2);
3614     }
3615     if ((i & 1) == 0) {
3616       MACC(a[j], a[j], t0, t1, t2);
3617     }
3618     for (; j < i; j++) {
3619       MACC(m[j], n[i-j], t0, t1, t2);
3620     }
3621     m[i] = t0 * inv;
3622     MACC(m[i], n[0], t0, t1, t2);
3623 
3624     assert(t0 == 0, "broken Montgomery square");
3625 
3626     t0 = t1; t1 = t2; t2 = 0;
3627   }
3628 
3629   for (i = len; i < 2*len; i++) {
3630     int start = i-len+1;
3631     int end = start + (len - start)/2;
3632     int j;
3633     for (j = start; j < end; j++) {
3634       MACC2(a[j], a[i-j], t0, t1, t2);
3635       MACC(m[j], n[i-j], t0, t1, t2);
3636     }
3637     if ((i & 1) == 0) {
3638       MACC(a[j], a[j], t0, t1, t2);
3639     }
3640     for (; j < len; j++) {
3641       MACC(m[j], n[i-j], t0, t1, t2);
3642     }
3643     m[i-len] = t0;
3644     t0 = t1; t1 = t2; t2 = 0;
3645   }
3646 
3647   while (t0)
3648     t0 = sub(m, n, t0, len);
3649 }
3650 
3651 // Swap words in a longword.
3652 static julong swap(julong x) {
3653   return (x << 32) | (x >> 32);
3654 }
3655 
3656 // Copy len longwords from s to d, word-swapping as we go.  The
3657 // destination array is reversed.
3658 static void reverse_words(julong *s, julong *d, int len) {
3659   d += len;
3660   while(len-- > 0) {
3661     d--;
3662     *d = swap(*s);
3663     s++;
3664   }
3665 }
3666 
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
// The value is compared against the array length in jints:
// SharedRuntime::montgomery_square() uses the dedicated squaring routine
// when len >= this value and the general multiply routine otherwise.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3670 
3671 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3672                                         jint len, jlong inv,
3673                                         jint *m_ints) {
3674   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3675   int longwords = len/2;
3676 
3677   // Make very sure we don't use so much space that the stack might
3678   // overflow.  512 jints corresponds to an 16384-bit integer and
3679   // will use here a total of 8k bytes of stack space.
3680   int divisor = sizeof(julong) * 4;
3681   guarantee(longwords <= 8192 / divisor, "must be");
3682   int total_allocation = longwords * sizeof (julong) * 4;
3683   julong *scratch = (julong *)alloca(total_allocation);
3684 
3685   // Local scratch arrays
3686   julong
3687     *a = scratch + 0 * longwords,
3688     *b = scratch + 1 * longwords,
3689     *n = scratch + 2 * longwords,
3690     *m = scratch + 3 * longwords;
3691 
3692   reverse_words((julong *)a_ints, a, longwords);
3693   reverse_words((julong *)b_ints, b, longwords);
3694   reverse_words((julong *)n_ints, n, longwords);
3695 
3696   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3697 
3698   reverse_words(m, (julong *)m_ints, longwords);
3699 }
3700 
3701 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3702                                       jint len, jlong inv,
3703                                       jint *m_ints) {
3704   assert(len % 2 == 0, "array length in montgomery_square must be even");
3705   int longwords = len/2;
3706 
3707   // Make very sure we don't use so much space that the stack might
3708   // overflow.  512 jints corresponds to an 16384-bit integer and
3709   // will use here a total of 6k bytes of stack space.
3710   int divisor = sizeof(julong) * 3;
3711   guarantee(longwords <= (8192 / divisor), "must be");
3712   int total_allocation = longwords * sizeof (julong) * 3;
3713   julong *scratch = (julong *)alloca(total_allocation);
3714 
3715   // Local scratch arrays
3716   julong
3717     *a = scratch + 0 * longwords,
3718     *n = scratch + 1 * longwords,
3719     *m = scratch + 2 * longwords;
3720 
3721   reverse_words((julong *)a_ints, a, longwords);
3722   reverse_words((julong *)n_ints, n, longwords);
3723 
3724   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3725     ::montgomery_square(a, n, m, (julong)inv, longwords);
3726   } else {
3727     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3728   }
3729 
3730   reverse_words(m, (julong *)m_ints, longwords);
3731 }
3732 
3733 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3734   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3735   if (buf == nullptr) {
3736     return nullptr;
3737   }
3738   CodeBuffer buffer(buf);
3739   short buffer_locs[20];
3740   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3741                                          sizeof(buffer_locs)/sizeof(relocInfo));
3742 
3743   MacroAssembler* masm = new MacroAssembler(&buffer);
3744 
3745   const Array<SigEntry>* sig_vk = vk->extended_sig();
3746   const Array<VMRegPair>* regs = vk->return_regs();
3747 
3748   int pack_fields_jobject_off = __ offset();
3749   // Resolve pre-allocated buffer from JNI handle.
3750   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3751   __ movptr(rax, Address(r13, 0));
3752   __ resolve_jobject(rax /* value */,
3753                      r12 /* tmp */);
3754   __ movptr(Address(r13, 0), rax);
3755 
3756   int pack_fields_off = __ offset();
3757 
3758   int j = 1;
3759   for (int i = 0; i < sig_vk->length(); i++) {
3760     BasicType bt = sig_vk->at(i)._bt;
3761     if (bt == T_METADATA) {
3762       continue;
3763     }
3764     if (bt == T_VOID) {
3765       if (sig_vk->at(i-1)._bt == T_LONG ||
3766           sig_vk->at(i-1)._bt == T_DOUBLE) {
3767         j++;
3768       }
3769       continue;
3770     }
3771     int off = sig_vk->at(i)._offset;
3772     assert(off > 0, "offset in object should be positive");
3773     VMRegPair pair = regs->at(j);
3774     VMReg r_1 = pair.first();
3775     Address to(rax, off);
3776     if (bt == T_FLOAT) {
3777       __ movflt(to, r_1->as_XMMRegister());
3778     } else if (bt == T_DOUBLE) {
3779       __ movdbl(to, r_1->as_XMMRegister());
3780     } else {
3781       Register val = r_1->as_Register();
3782       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3783       if (is_reference_type(bt)) {
3784         // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep rax valid.
3785         __ mov(rbx, rax);
3786         Address to_with_rbx(rbx, off);
3787         __ store_heap_oop(to_with_rbx, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3788       } else {
3789         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3790       }
3791     }
3792     j++;
3793   }
3794   assert(j == regs->length(), "missed a field?");
3795   if (vk->supports_nullable_layouts()) {
3796     // Set the null marker
3797     __ movb(Address(rax, vk->null_marker_offset()), 1);
3798   }
3799   __ ret(0);
3800 
3801   int unpack_fields_off = __ offset();
3802 
3803   Label skip;
3804   Label not_null;
3805   __ testptr(rax, rax);
3806   __ jcc(Assembler::notZero, not_null);
3807 
3808   // Return value is null. Zero all registers because the runtime requires a canonical
3809   // representation of a flat null.
3810   j = 1;
3811   for (int i = 0; i < sig_vk->length(); i++) {
3812     BasicType bt = sig_vk->at(i)._bt;
3813     if (bt == T_METADATA) {
3814       continue;
3815     }
3816     if (bt == T_VOID) {
3817       if (sig_vk->at(i-1)._bt == T_LONG ||
3818           sig_vk->at(i-1)._bt == T_DOUBLE) {
3819         j++;
3820       }
3821       continue;
3822     }
3823 
3824     VMRegPair pair = regs->at(j);
3825     VMReg r_1 = pair.first();
3826     if (r_1->is_XMMRegister()) {
3827       __ xorps(r_1->as_XMMRegister(), r_1->as_XMMRegister());
3828     } else {
3829       __ xorl(r_1->as_Register(), r_1->as_Register());
3830     }
3831     j++;
3832   }
3833   __ jmp(skip);
3834   __ bind(not_null);
3835 
3836   j = 1;
3837   for (int i = 0; i < sig_vk->length(); i++) {
3838     BasicType bt = sig_vk->at(i)._bt;
3839     if (bt == T_METADATA) {
3840       continue;
3841     }
3842     if (bt == T_VOID) {
3843       if (sig_vk->at(i-1)._bt == T_LONG ||
3844           sig_vk->at(i-1)._bt == T_DOUBLE) {
3845         j++;
3846       }
3847       continue;
3848     }
3849     int off = sig_vk->at(i)._offset;
3850     assert(off > 0, "offset in object should be positive");
3851     VMRegPair pair = regs->at(j);
3852     VMReg r_1 = pair.first();
3853     VMReg r_2 = pair.second();
3854     Address from(rax, off);
3855     if (bt == T_FLOAT) {
3856       __ movflt(r_1->as_XMMRegister(), from);
3857     } else if (bt == T_DOUBLE) {
3858       __ movdbl(r_1->as_XMMRegister(), from);
3859     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3860       assert_different_registers(rax, r_1->as_Register());
3861       __ load_heap_oop(r_1->as_Register(), from);
3862     } else {
3863       assert(is_java_primitive(bt), "unexpected basic type");
3864       assert_different_registers(rax, r_1->as_Register());
3865       size_t size_in_bytes = type2aelembytes(bt);
3866       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3867     }
3868     j++;
3869   }
3870   assert(j == regs->length(), "missed a field?");
3871 
3872   __ bind(skip);
3873   __ ret(0);
3874 
3875   __ flush();
3876 
3877   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3878 }
3879 
3880 #if INCLUDE_JFR
3881 
3882 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3883 // It returns a jobject handle to the event writer.
3884 // The handle is dereferenced and the return value is the event writer oop.
3885 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3886   enum layout {
3887     rbp_off,
3888     rbpH_off,
3889     return_off,
3890     return_off2,
3891     framesize // inclusive of return address
3892   };
3893 
3894   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3895   CodeBuffer code(name, 1024, 64);
3896   MacroAssembler* masm = new MacroAssembler(&code);
3897   address start = __ pc();
3898 
3899   __ enter();
3900   address the_pc = __ pc();
3901 
3902   int frame_complete = the_pc - start;
3903 
3904   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3905   __ movptr(c_rarg0, r15_thread);
3906   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3907   __ reset_last_Java_frame(true);
3908 
3909   // rax is jobject handle result, unpack and process it through a barrier.
3910   __ resolve_global_jobject(rax, c_rarg0);
3911 
3912   __ leave();
3913   __ ret(0);
3914 
3915   OopMapSet* oop_maps = new OopMapSet();
3916   OopMap* map = new OopMap(framesize, 1);
3917   oop_maps->add_gc_map(frame_complete, map);
3918 
3919   RuntimeStub* stub =
3920     RuntimeStub::new_runtime_stub(name,
3921                                   &code,
3922                                   frame_complete,
3923                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3924                                   oop_maps,
3925                                   false);
3926   return stub;
3927 }
3928 
3929 // For c2: call to return a leased buffer.
3930 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3931   enum layout {
3932     rbp_off,
3933     rbpH_off,
3934     return_off,
3935     return_off2,
3936     framesize // inclusive of return address
3937   };
3938 
3939   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3940   CodeBuffer code(name, 1024, 64);
3941   MacroAssembler* masm = new MacroAssembler(&code);
3942   address start = __ pc();
3943 
3944   __ enter();
3945   address the_pc = __ pc();
3946 
3947   int frame_complete = the_pc - start;
3948 
3949   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3950   __ movptr(c_rarg0, r15_thread);
3951   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3952   __ reset_last_Java_frame(true);
3953 
3954   __ leave();
3955   __ ret(0);
3956 
3957   OopMapSet* oop_maps = new OopMapSet();
3958   OopMap* map = new OopMap(framesize, 1);
3959   oop_maps->add_gc_map(frame_complete, map);
3960 
3961   RuntimeStub* stub =
3962     RuntimeStub::new_runtime_stub(name,
3963                                   &code,
3964                                   frame_complete,
3965                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3966                                   oop_maps,
3967                                   false);
3968   return stub;
3969 }
3970 
3971 #endif // INCLUDE_JFR