1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
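// Byte offsets of the register groups within the XSAVE image written by
// push_FPU_state(); these match the layout the VM expects on x86_64.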
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
  // Push CPU state in multiples of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
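  // With APX, also save the extended GPRs (r16..r31) into the EGPR area of the save frame.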
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for (int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
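  // Clear the upper YMM/ZMM state to avoid AVX-SSE transition penalties in the code that follows.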
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
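// STACK_OFFSET converts a layout enum index (a 4-byte stack slot number) into
// the VMReg stack location recorded in the OopMap below.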
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is also included in the XSAVE area.
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is also included in the XSAVE area.
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
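  // With APX, restore the extended GPRs (r16..r31) from the EGPR area of the save frame.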
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-saved register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
  // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course for i486 there is no 64 bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
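  // stk_args counts 4-byte stack slots; each stack argument is placed on an
  // 8-byte (2-slot) boundary, hence the align_up(stk_args, 2) below.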
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
 637 // Same as java_calling_convention() but for multiple return
 638 // values. There's no way to store them on the stack so if we don't
 639 // have enough registers, multiple values can't be returned.
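// The extra (+1) integer slot is rax, which is usable for returns in addition
// to the j_rarg registers (see INT_ArgReg below).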
 640 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 641 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 642 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 643                                           VMRegPair *regs,
 644                                           int total_args_passed) {
 645   // Create the mapping between argument positions and
 646   // registers.
 647   static const Register INT_ArgReg[java_return_convention_max_int] = {
 648     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 649   };
 650   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 651     j_farg0, j_farg1, j_farg2, j_farg3,
 652     j_farg4, j_farg5, j_farg6, j_farg7
 653   };
 654 
 655 
 656   uint int_args = 0;
 657   uint fp_args = 0;
 658 
 659   for (int i = 0; i < total_args_passed; i++) {
 660     switch (sig_bt[i]) {
 661     case T_BOOLEAN:
 662     case T_CHAR:
 663     case T_BYTE:
 664     case T_SHORT:
 665     case T_INT:
 666       if (int_args < Argument::n_int_register_parameters_j+1) {
 667         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 668         int_args++;
 669       } else {
 670         return -1;
 671       }
 672       break;
 673     case T_VOID:
 674       // halves of T_LONG or T_DOUBLE
 675       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 676       regs[i].set_bad();
 677       break;
 678     case T_LONG:
 679       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 680       // fall through
 681     case T_OBJECT:
 682     case T_ARRAY:
 683     case T_ADDRESS:
 684     case T_METADATA:
 685       if (int_args < Argument::n_int_register_parameters_j+1) {
 686         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 687         int_args++;
 688       } else {
 689         return -1;
 690       }
 691       break;
 692     case T_FLOAT:
 693       if (fp_args < Argument::n_float_register_parameters_j) {
 694         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 695         fp_args++;
 696       } else {
 697         return -1;
 698       }
 699       break;
 700     case T_DOUBLE:
 701       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 702       if (fp_args < Argument::n_float_register_parameters_j) {
 703         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 704         fp_args++;
 705       } else {
 706         return -1;
 707       }
 708       break;
 709     default:
 710       ShouldNotReachHere();
 711       break;
 712     }
 713   }
 714 
 715   return int_args + fp_args;
 716 }
 717 
// Patch the caller's callsite with entry to compiled code if it exists.
 719 static void patch_callers_callsite(MacroAssembler *masm) {
 720   Label L;
 721   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 722   __ jcc(Assembler::equal, L);
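  // The method has compiled code (Method::_code is non-null): call into the VM
  // to patch the caller's call site.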
 723 
 724   // Save the current stack pointer
 725   __ mov(r13, rsp);
 726   // Schedule the branch target address early.
 727   // Call into the VM to patch the caller, then jump to compiled callee
 728   // rax isn't live so capture return address while we easily can
 729   __ movptr(rax, Address(rsp, 0));
 730 
 731   // align stack so push_CPU_state doesn't fault
 732   __ andptr(rsp, -(StackAlignmentInBytes));
 733   __ push_CPU_state();
 734   __ vzeroupper();
 735   // VM needs caller's callsite
 736   // VM needs target method
 737   // This needs to be a long call since we will relocate this adapter to
 738   // the codeBuffer and it may not reach
 739 
 740   // Allocate argument register save area
 741   if (frame::arg_reg_save_area_bytes != 0) {
 742     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 743   }
 744   __ mov(c_rarg0, rbx);
 745   __ mov(c_rarg1, rax);
 746   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 747 
 748   // De-allocate argument register save area
 749   if (frame::arg_reg_save_area_bytes != 0) {
 750     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 751   }
 752 
 753   __ vzeroupper();
 754   __ pop_CPU_state();
 755   // restore sp
 756   __ mov(rsp, r13);
 757   __ bind(L);
 758 }
 759 
 760 // For each inline type argument, sig includes the list of fields of
 761 // the inline type. This utility function computes the number of
 762 // arguments for the call if inline types are passed by reference (the
 763 // calling convention the interpreter expects).
 764 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 765   int total_args_passed = 0;
 766   if (InlineTypePassFieldsAsArgs) {
 767     for (int i = 0; i < sig_extended->length(); i++) {
 768       BasicType bt = sig_extended->at(i)._bt;
 769       if (bt == T_METADATA) {
 770         // In sig_extended, an inline type argument starts with:
 771         // T_METADATA, followed by the types of the fields of the
 772         // inline type and T_VOID to mark the end of the value
 773         // type. Inline types are flattened so, for instance, in the
 774         // case of an inline type with an int field and an inline type
 775         // field that itself has 2 fields, an int and a long:
 776         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 777         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 778         // (outer inline type)
 779         total_args_passed++;
 780         int vt = 1;
 781         do {
 782           i++;
 783           BasicType bt = sig_extended->at(i)._bt;
 784           BasicType prev_bt = sig_extended->at(i-1)._bt;
 785           if (bt == T_METADATA) {
 786             vt++;
 787           } else if (bt == T_VOID &&
 788                      prev_bt != T_LONG &&
 789                      prev_bt != T_DOUBLE) {
 790             vt--;
 791           }
 792         } while (vt != 0);
 793       } else {
 794         total_args_passed++;
 795       }
 796     }
 797   } else {
 798     total_args_passed = sig_extended->length();
 799   }
 800   return total_args_passed;
 801 }
 802 
 803 
 804 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 805                                    BasicType bt,
 806                                    BasicType prev_bt,
 807                                    size_t size_in_bytes,
 808                                    const VMRegPair& reg_pair,
 809                                    const Address& to,
 810                                    int extraspace,
 811                                    bool is_oop) {
 812   if (bt == T_VOID) {
 813     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 814     return;
 815   }
 816 
 817   // Say 4 args:
 818   // i   st_off
 819   // 0   32 T_LONG
 820   // 1   24 T_VOID
 821   // 2   16 T_OBJECT
 822   // 3    8 T_BOOL
 823   // -    0 return address
 824   //
  // However, to make things extra confusing, because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 829 
 830   bool wide = (size_in_bytes == wordSize);
 831   VMReg r_1 = reg_pair.first();
 832   VMReg r_2 = reg_pair.second();
 833   assert(r_2->is_valid() == wide, "invalid size");
 834   if (!r_1->is_valid()) {
 835     assert(!r_2->is_valid(), "must be invalid");
 836     return;
 837   }
 838 
 839   if (!r_1->is_XMMRegister()) {
 840     Register val = rax;
 841     if (r_1->is_stack()) {
 842       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 843       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 844     } else {
 845       val = r_1->as_Register();
 846     }
 847     assert_different_registers(to.base(), val, rscratch1);
 848     if (is_oop) {
 849       __ push(r13);
 850       __ push(rbx);
 851       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 852       __ pop(rbx);
 853       __ pop(r13);
 854     } else {
 855       __ store_sized_value(to, val, size_in_bytes);
 856     }
 857   } else {
 858     if (wide) {
 859       __ movdbl(to, r_1->as_XMMRegister());
 860     } else {
 861       __ movflt(to, r_1->as_XMMRegister());
 862     }
 863   }
 864 }
 865 
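// Generate the compiled-to-interpreter (c2i) adapter: spill the arguments from
// the compiled calling convention into the interpreter's expected stack layout
// (allocating buffers for inline type arguments if needed) and jump to the
// method's interpreter entry.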
 866 static void gen_c2i_adapter(MacroAssembler *masm,
 867                             const GrowableArray<SigEntry>* sig_extended,
 868                             const VMRegPair *regs,
 869                             bool requires_clinit_barrier,
 870                             address& c2i_no_clinit_check_entry,
 871                             Label& skip_fixup,
 872                             address start,
 873                             OopMapSet* oop_maps,
 874                             int& frame_complete,
 875                             int& frame_size_in_words,
 876                             bool alloc_inline_receiver) {
 877   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 878     Label L_skip_barrier;
 879     Register method = rbx;
 880 
 881     { // Bypass the barrier for non-static methods
 882       Register flags = rscratch1;
 883       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
 884       __ testl(flags, JVM_ACC_STATIC);
 885       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 886     }
 887 
 888     Register klass = rscratch1;
 889     __ load_method_holder(klass, method);
 890     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 891 
 892     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 893 
 894     __ bind(L_skip_barrier);
 895     c2i_no_clinit_check_entry = __ pc();
 896   }
 897 
 898   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 899   bs->c2i_entry_barrier(masm);
 900 
 901   // Before we get into the guts of the C2I adapter, see if we should be here
 902   // at all.  We've come from compiled code and are attempting to jump to the
 903   // interpreter, which means the caller made a static call to get here
 904   // (vcalls always get a compiled target if there is one).  Check for a
 905   // compiled target.  If there is one, we need to patch the caller's call.
 906   patch_callers_callsite(masm);
 907 
 908   __ bind(skip_fixup);
 909 
 910   if (InlineTypePassFieldsAsArgs) {
 911     // Is there an inline type argument?
 912     bool has_inline_argument = false;
 913     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 914       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 915     }
 916     if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
 918       // compiled code so we have no buffers to back the inline types.
 919       // Allocate the buffers here with a runtime call.
 920       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 921 
 922       frame_complete = __ offset();
 923 
 924       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 925 
 926       __ mov(c_rarg0, r15_thread);
 927       __ mov(c_rarg1, rbx);
 928       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 929       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 930 
 931       oop_maps->add_gc_map((int)(__ pc() - start), map);
 932       __ reset_last_Java_frame(false);
 933 
 934       RegisterSaver::restore_live_registers(masm);
 935 
 936       Label no_exception;
 937       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 938       __ jcc(Assembler::equal, no_exception);
 939 
 940       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 941       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 942       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 943 
 944       __ bind(no_exception);
 945 
 946       // We get an array of objects from the runtime call
 947       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 948       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 949     }
 950   }
 951 
 952   // Since all args are passed on the stack, total_args_passed *
 953   // Interpreter::stackElementSize is the space we need.
 954   int total_args_passed = compute_total_args_passed_int(sig_extended);
 955   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 956 
 957   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 958 
 959   // stack is aligned, keep it that way
 960   // This is not currently needed or enforced by the interpreter, but
 961   // we might as well conform to the ABI.
 962   extraspace = align_up(extraspace, 2*wordSize);
 963 
 964   // set senderSP value
 965   __ lea(r13, Address(rsp, wordSize));
 966 
 967 #ifdef ASSERT
 968   __ check_stack_alignment(r13, "sender stack not aligned");
 969 #endif
 970   if (extraspace > 0) {
 971     // Pop the return address
 972     __ pop(rax);
 973 
 974     __ subptr(rsp, extraspace);
 975 
 976     // Push the return address
 977     __ push(rax);
 978 
 979     // Account for the return address location since we store it first rather
 980     // than hold it in a register across all the shuffling
 981     extraspace += wordSize;
 982   }
 983 
 984 #ifdef ASSERT
 985   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 986 #endif
 987 
 988   // Now write the args into the outgoing interpreter space
 989 
 990   // next_arg_comp is the next argument from the compiler point of
 991   // view (inline type fields are passed in registers/on the stack). In
 992   // sig_extended, an inline type argument starts with: T_METADATA,
 993   // followed by the types of the fields of the inline type and T_VOID
 994   // to mark the end of the inline type. ignored counts the number of
 995   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 996   // used to get the buffer for that argument from the pool of buffers
 997   // we allocated above and want to pass to the
 998   // interpreter. next_arg_int is the next argument from the
 999   // interpreter point of view (inline types are passed by reference).
1000   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1001        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1002     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1003     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1004     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1005     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1006     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1007       int next_off = st_off - Interpreter::stackElementSize;
1008       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1009       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1010       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1011       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1012                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1013       next_arg_int++;
1014 #ifdef ASSERT
1015       if (bt == T_LONG || bt == T_DOUBLE) {
1016         // Overwrite the unused slot with known junk
1017         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1018         __ movptr(Address(rsp, st_off), rax);
1019       }
1020 #endif /* ASSERT */
1021     } else {
1022       ignored++;
1023       // get the buffer from the just allocated pool of buffers
1024       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1025       __ load_heap_oop(r14, Address(rscratch2, index));
1026       next_vt_arg++; next_arg_int++;
1027       int vt = 1;
1028       // write fields we get from compiled code in registers/stack
1029       // slots to the buffer: we know we are done with that inline type
1030       // argument when we hit the T_VOID that acts as an end of inline
1031       // type delimiter for this inline type. Inline types are flattened
1032       // so we might encounter embedded inline types. Each entry in
1033       // sig_extended contains a field offset in the buffer.
1034       Label L_null;
1035       do {
1036         next_arg_comp++;
1037         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1038         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1039         if (bt == T_METADATA) {
1040           vt++;
1041           ignored++;
1042         } else if (bt == T_VOID &&
1043                    prev_bt != T_LONG &&
1044                    prev_bt != T_DOUBLE) {
1045           vt--;
1046           ignored++;
1047         } else {
1048           int off = sig_extended->at(next_arg_comp)._offset;
1049           if (off == -1) {
1050             // Nullable inline type argument, emit null check
1051             VMReg reg = regs[next_arg_comp-ignored].first();
1052             Label L_notNull;
1053             if (reg->is_stack()) {
1054               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1055               __ testb(Address(rsp, ld_off), 1);
1056             } else {
1057               __ testb(reg->as_Register(), 1);
1058             }
1059             __ jcc(Assembler::notZero, L_notNull);
1060             __ movptr(Address(rsp, st_off), 0);
1061             __ jmp(L_null);
1062             __ bind(L_notNull);
1063             continue;
1064           }
1065           assert(off > 0, "offset in object should be positive");
1066           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1067           bool is_oop = is_reference_type(bt);
1068           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1069                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1070         }
1071       } while (vt != 0);
1072       // pass the buffer to the interpreter
1073       __ movptr(Address(rsp, st_off), r14);
1074       __ bind(L_null);
1075     }
1076   }
1077 
1078   // Schedule the branch target address early.
1079   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1080   __ jmp(rcx);
1081 }
1082 
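// Branches to L_ok if pc_reg lies strictly between code_start and code_end;
// otherwise falls through (via L_fail) so the caller can test another range or stop.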
1083 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1084                         address code_start, address code_end,
1085                         Label& L_ok) {
1086   Label L_fail;
1087   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
1088   __ cmpptr(pc_reg, temp_reg);
1089   __ jcc(Assembler::belowEqual, L_fail);
1090   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
1091   __ cmpptr(pc_reg, temp_reg);
1092   __ jcc(Assembler::below, L_ok);
1093   __ bind(L_fail);
1094 }
1095 
1096 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1097                                     int comp_args_on_stack,
1098                                     const GrowableArray<SigEntry>* sig,
1099                                     const VMRegPair *regs) {
1100 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, because
  // we must align the stack to 16 bytes on an i2c entry or else we lose
  // the alignment we expect in all compiled code, and the register save
  // code can segv when fxsave instructions find an improperly aligned
  // stack pointer.
1109 
1110   // Adapters can be frameless because they do not require the caller
1111   // to perform additional cleanup work, such as correcting the stack pointer.
1112   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1113   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1114   // even if a callee has modified the stack pointer.
1115   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1116   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1117   // up via the senderSP register).
1118   // In other words, if *either* the caller or callee is interpreted, we can
1119   // get the stack pointer repaired after a call.
1120   // This is why c2i and i2c adapters cannot be indefinitely composed.
1121   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1122   // both caller and callee would be compiled methods, and neither would
1123   // clean up the stack pointer changes performed by the two adapters.
1124   // If this happens, control eventually transfers back to the compiled
1125   // caller, but with an uncorrected stack, causing delayed havoc.
1126 
1127   if (VerifyAdapterCalls &&
1128       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1129     // So, let's test for cascading c2i/i2c adapters right now.
1130     //  assert(Interpreter::contains($return_addr) ||
1131     //         StubRoutines::contains($return_addr),
1132     //         "i2c adapter must return to an interpreter frame");
1133     __ block_comment("verify_i2c { ");
1134     // Pick up the return address
1135     __ movptr(rax, Address(rsp, 0));
1136     Label L_ok;
1137     if (Interpreter::code() != nullptr) {
1138       range_check(masm, rax, r11,
1139                   Interpreter::code()->code_start(),
1140                   Interpreter::code()->code_end(),
1141                   L_ok);
1142     }
1143     if (StubRoutines::initial_stubs_code() != nullptr) {
1144       range_check(masm, rax, r11,
1145                   StubRoutines::initial_stubs_code()->code_begin(),
1146                   StubRoutines::initial_stubs_code()->code_end(),
1147                   L_ok);
1148     }
1149     if (StubRoutines::final_stubs_code() != nullptr) {
1150       range_check(masm, rax, r11,
1151                   StubRoutines::final_stubs_code()->code_begin(),
1152                   StubRoutines::final_stubs_code()->code_end(),
1153                   L_ok);
1154     }
1155     const char* msg = "i2c adapter must return to an interpreter frame";
1156     __ block_comment(msg);
1157     __ stop(msg);
1158     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
1160   }
1161 
1162   // Must preserve original SP for loading incoming arguments because
1163   // we need to align the outgoing SP for compiled code.
1164   __ movptr(r11, rsp);
1165 
1166   // Pick up the return address
1167   __ pop(rax);
1168 
1169   // Convert 4-byte c2 stack slots to words.
1170   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1171 
1172   if (comp_args_on_stack) {
1173     __ subptr(rsp, comp_words_on_stack * wordSize);
1174   }
1175 
1176   // Ensure compiled code always sees stack at proper alignment
1177   __ andptr(rsp, -16);
1178 
  // Push the return address; this misaligns the stack, which is how the youngest
  // frame always sees it at the point of the call instruction.
1181   __ push(rax);
1182 
1183   // Put saved SP in another register
1184   const Register saved_sp = rax;
1185   __ movptr(saved_sp, r11);
1186 
1187   // Will jump to the compiled code just as if compiled code was doing it.
1188   // Pre-load the register-jump target early, to schedule it better.
1189   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1190 
1191 #if INCLUDE_JVMCI
1192   if (EnableJVMCI) {
1193     // check if this call should be routed towards a specific entry point
1194     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1195     Label no_alternative_target;
1196     __ jcc(Assembler::equal, no_alternative_target);
1197     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1198     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1199     __ bind(no_alternative_target);
1200   }
1201 #endif // INCLUDE_JVMCI
1202 
1203   int total_args_passed = sig->length();
1204 
1205   // Now generate the shuffle code.  Pick up all register args and move the
1206   // rest through the floating point stack top.
1207   for (int i = 0; i < total_args_passed; i++) {
1208     BasicType bt = sig->at(i)._bt;
1209     if (bt == T_VOID) {
1210       // Longs and doubles are passed in native word order, but misaligned
1211       // in the 32-bit build.
1212       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1213       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1214       continue;
1215     }
1216 
1217     // Pick up 0, 1 or 2 words from SP+offset.
1218 
1219     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1220             "scrambled load targets?");
1221     // Load in argument order going down.
1222     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1223     // Point to interpreter value (vs. tag)
1224     int next_off = ld_off - Interpreter::stackElementSize;
1225     //
1226     //
1227     //
1228     VMReg r_1 = regs[i].first();
1229     VMReg r_2 = regs[i].second();
1230     if (!r_1->is_valid()) {
1231       assert(!r_2->is_valid(), "");
1232       continue;
1233     }
1234     if (r_1->is_stack()) {
1235       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1236       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1237 
1238       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
1239       // and if we end up going thru a c2i because of a miss a reasonable value of r13
1240       // will be generated.
1241       if (!r_2->is_valid()) {
1242         // sign extend???
1243         __ movl(r13, Address(saved_sp, ld_off));
1244         __ movptr(Address(rsp, st_off), r13);
1245       } else {
1246         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1250         //
1251         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1252         // are accessed as negative so LSW is at LOW address
1253 
1254         // ld_off is MSW so get LSW
1255         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1256                            next_off : ld_off;
1257         __ movq(r13, Address(saved_sp, offset));
1258         // st_off is LSW (i.e. reg.first())
1259         __ movq(Address(rsp, st_off), r13);
1260       }
1261     } else if (r_1->is_Register()) {  // Register argument
1262       Register r = r_1->as_Register();
1263       assert(r != rax, "must be different");
1264       if (r_2->is_valid()) {
1265         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1269 
1270         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1271                            next_off : ld_off;
1272 
1273         // this can be a misaligned move
1274         __ movq(r, Address(saved_sp, offset));
1275       } else {
1276         // movl zero-extends into the full register; no explicit sign extension is needed for int-sized args here
1277         __ movl(r, Address(saved_sp, ld_off));
1278       }
1279     } else {
1280       if (!r_2->is_valid()) {
1281         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1282       } else {
1283         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1284       }
1285     }
1286   }
1287 
1288   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1289 
1290   // 6243940 We might end up in handle_wrong_method if
1291   // the callee is deoptimized as we race through here. If that
1292   // happens we don't want to take a safepoint because the
1293   // caller frame will look interpreted and arguments are now
1294   // "compiled" so it is much better to make this transition
1295   // invisible to the stack walking code. Unfortunately, if
1296   // we try to find the callee by normal means a safepoint
1297   // is possible. So we stash the desired callee in the thread
1298   // and the VM will find it there should this case occur.
1299 
1300   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1301 
1302   // Put the Method* where a c2i adapter would expect it, should we end up there.
1303   // This is only needed because C2 resolve stubs return the Method* as a result
1304   // in rax.
1305   __ mov(rax, rbx);
1306   __ jmp(r11);
1307 }
1308 
1309 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1310   Register data = rax;
1311   __ ic_check(1 /* end_alignment */);
1312   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1313 
1314   // Method might have been compiled since the call site was patched to
1315   // interpreted; if that is the case, treat it as a miss so we can get
1316   // the call site corrected.
1317   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1318   __ jcc(Assembler::equal, skip_fixup);
1319   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1320 }
1321 
1322 // ---------------------------------------------------------------
1323 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1324                                                             int comp_args_on_stack,
1325                                                             const GrowableArray<SigEntry>* sig,
1326                                                             const VMRegPair* regs,
1327                                                             const GrowableArray<SigEntry>* sig_cc,
1328                                                             const VMRegPair* regs_cc,
1329                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1330                                                             const VMRegPair* regs_cc_ro,
1331                                                             AdapterFingerPrint* fingerprint,
1332                                                             AdapterBlob*& new_adapter,
1333                                                             bool allocate_code_blob) {
1334   address i2c_entry = __ pc();
1335   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1336 
1337   // -------------------------------------------------------------------------
1338   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1339   // to the interpreter.  The args start out packed in the compiled layout.  They
1340   // need to be unpacked into the interpreter layout.  This will almost always
1341   // require some stack space.  We grow the current (compiled) stack, then repack
1342   // the args.  We finally end in a jump to the generic interpreter entry point.
1343   // On exit from the interpreter, the interpreter will restore our SP (lest the
1344   // compiled code, which relies solely on SP and not RBP, get sick).
1345 
1346   address c2i_unverified_entry        = __ pc();
1347   address c2i_unverified_inline_entry = __ pc();
1348   Label skip_fixup;
1349 
1350   gen_inline_cache_check(masm, skip_fixup);
1351 
1352   OopMapSet* oop_maps = new OopMapSet();
1353   int frame_complete = CodeOffsets::frame_never_safe;
1354   int frame_size_in_words = 0;
1355 
1356   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1357   address c2i_no_clinit_check_entry = nullptr;
1358   address c2i_inline_ro_entry = __ pc();
1359   if (regs_cc != regs_cc_ro) {
1360     // No class init barrier needed because method is guaranteed to be non-static
1361     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1362                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1363     skip_fixup.reset();
1364   }
1365 
1366   // Scalarized c2i adapter
1367   address c2i_entry        = __ pc();
1368   address c2i_inline_entry = __ pc();
1369   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1370                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1371 
1372   // Non-scalarized c2i adapter
1373   if (regs != regs_cc) {
1374     c2i_unverified_inline_entry = __ pc();
1375     Label inline_entry_skip_fixup;
1376     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1377 
1378     c2i_inline_entry = __ pc();
1379     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1380                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1381   }
1382 
1383   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1384   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1385   if (allocate_code_blob) {
1386     bool caller_must_gc_arguments = (regs != regs_cc);
1387     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1388   }
1389 
1390   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1391 }
1392 
1393 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1394                                          VMRegPair *regs,
1395                                          int total_args_passed) {
1396 
1397 // We return the number of VMRegImpl stack slots we need to reserve for all
1398 // the arguments NOT counting out_preserve_stack_slots.
1399 
1400 // NOTE: These arrays will have to change when c1 is ported
1401 #ifdef _WIN64
1402     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1403       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1404     };
1405     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1406       c_farg0, c_farg1, c_farg2, c_farg3
1407     };
1408 #else
1409     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1410       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1411     };
1412     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1413       c_farg0, c_farg1, c_farg2, c_farg3,
1414       c_farg4, c_farg5, c_farg6, c_farg7
1415     };
1416 #endif // _WIN64
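         // For illustration only: with sig_bt = { T_INT, T_LONG, T_VOID, T_FLOAT },
         // the System V convention below assigns rdi (c_rarg0), rsi (c_rarg1) and
         // xmm0 (c_farg0) and returns 0 stack slots, while the Windows convention
         // assigns rcx, rdx and xmm2 and returns 8 slots because of the mandatory
         // home space (see the _WIN64 block at the end of this function).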
1417 
1418 
1419     uint int_args = 0;
1420     uint fp_args = 0;
1421     uint stk_args = 0; // inc by 2 each time
1422 
1423     for (int i = 0; i < total_args_passed; i++) {
1424       switch (sig_bt[i]) {
1425       case T_BOOLEAN:
1426       case T_CHAR:
1427       case T_BYTE:
1428       case T_SHORT:
1429       case T_INT:
1430         if (int_args < Argument::n_int_register_parameters_c) {
1431           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1432 #ifdef _WIN64
1433           fp_args++;
1434           // Allocate slots for the callee to stuff register args on the stack.
1435           stk_args += 2;
1436 #endif
1437         } else {
1438           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1439           stk_args += 2;
1440         }
1441         break;
1442       case T_LONG:
1443         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1444         // fall through
1445       case T_OBJECT:
1446       case T_ARRAY:
1447       case T_ADDRESS:
1448       case T_METADATA:
1449         if (int_args < Argument::n_int_register_parameters_c) {
1450           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1451 #ifdef _WIN64
1452           fp_args++;
1453           stk_args += 2;
1454 #endif
1455         } else {
1456           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1457           stk_args += 2;
1458         }
1459         break;
1460       case T_FLOAT:
1461         if (fp_args < Argument::n_float_register_parameters_c) {
1462           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1463 #ifdef _WIN64
1464           int_args++;
1465           // Allocate slots for the callee to stuff register args on the stack.
1466           stk_args += 2;
1467 #endif
1468         } else {
1469           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1470           stk_args += 2;
1471         }
1472         break;
1473       case T_DOUBLE:
1474         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1475         if (fp_args < Argument::n_float_register_parameters_c) {
1476           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1477 #ifdef _WIN64
1478           int_args++;
1479           // Allocate slots for the callee to stuff register args on the stack.
1480           stk_args += 2;
1481 #endif
1482         } else {
1483           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1484           stk_args += 2;
1485         }
1486         break;
1487       case T_VOID: // Halves of longs and doubles
1488         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1489         regs[i].set_bad();
1490         break;
1491       default:
1492         ShouldNotReachHere();
1493         break;
1494       }
1495     }
1496 #ifdef _WIN64
1497   // The Windows ABI requires that we always allocate enough stack space
1498   // for the 4 64-bit register arguments to be stored (the home space).
1499   if (stk_args < 8) {
1500     stk_args = 8;
1501   }
1502 #endif // _WIN64
1503 
1504   return stk_args;
1505 }
1506 
1507 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1508                                              uint num_bits,
1509                                              uint total_args_passed) {
1510   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1511          "only certain vector sizes are supported for now");
1512 
1513   static const XMMRegister VEC_ArgReg[32] = {
1514      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1515      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1516     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1517     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1518   };
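       // A 64-bit vector spans 2 VMReg slots, 128 bits span 4, 256 bits span 8,
       // and 512 bits span 16; the next_val computation below encodes exactly
       // that (slots - 1, passed to VMReg::next()).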
1519 
1520   uint stk_args = 0;
1521   uint fp_args = 0;
1522 
1523   for (uint i = 0; i < total_args_passed; i++) {
1524     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1525     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1526     regs[i].set_pair(vmreg->next(next_val), vmreg);
1527   }
1528 
1529   return stk_args;
1530 }
1531 
1532 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1533   // We always ignore the frame_slots arg and just use the space just below the
1534   // frame pointer, which by this time is free to use.
1535   switch (ret_type) {
1536   case T_FLOAT:
1537     __ movflt(Address(rbp, -wordSize), xmm0);
1538     break;
1539   case T_DOUBLE:
1540     __ movdbl(Address(rbp, -wordSize), xmm0);
1541     break;
1542   case T_VOID:  break;
1543   default: {
1544     __ movptr(Address(rbp, -wordSize), rax);
1545     }
1546   }
1547 }
1548 
1549 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1550   // We always ignore the frame_slots arg and just use the space just below the
1551   // frame pointer, which by this time is free to use.
1552   switch (ret_type) {
1553   case T_FLOAT:
1554     __ movflt(xmm0, Address(rbp, -wordSize));
1555     break;
1556   case T_DOUBLE:
1557     __ movdbl(xmm0, Address(rbp, -wordSize));
1558     break;
1559   case T_VOID:  break;
1560   default: {
1561     __ movptr(rax, Address(rbp, -wordSize));
1562     }
1563   }
1564 }
1565 
1566 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1567     for ( int i = first_arg ; i < arg_count ; i++ ) {
1568       if (args[i].first()->is_Register()) {
1569         __ push(args[i].first()->as_Register());
1570       } else if (args[i].first()->is_XMMRegister()) {
1571         __ subptr(rsp, 2*wordSize);
1572         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1573       }
1574     }
1575 }
1576 
1577 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1578     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1579       if (args[i].first()->is_Register()) {
1580         __ pop(args[i].first()->as_Register());
1581       } else if (args[i].first()->is_XMMRegister()) {
1582         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1583         __ addptr(rsp, 2*wordSize);
1584       }
1585     }
1586 }
1587 
1588 static void verify_oop_args(MacroAssembler* masm,
1589                             const methodHandle& method,
1590                             const BasicType* sig_bt,
1591                             const VMRegPair* regs) {
1592   Register temp_reg = rbx;  // not part of any compiled calling seq
1593   if (VerifyOops) {
1594     for (int i = 0; i < method->size_of_parameters(); i++) {
1595       if (is_reference_type(sig_bt[i])) {
1596         VMReg r = regs[i].first();
1597         assert(r->is_valid(), "bad oop arg");
1598         if (r->is_stack()) {
1599           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1600           __ verify_oop(temp_reg);
1601         } else {
1602           __ verify_oop(r->as_Register());
1603         }
1604       }
1605     }
1606   }
1607 }
1608 
1609 static void check_continuation_enter_argument(VMReg actual_vmreg,
1610                                               Register expected_reg,
1611                                               const char* name) {
1612   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1613   assert(actual_vmreg->as_Register() == expected_reg,
1614          "%s is in unexpected register: %s instead of %s",
1615          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1616 }
1617 
1618 
1619 //---------------------------- continuation_enter_setup ---------------------------
1620 //
1621 // Arguments:
1622 //   None.
1623 //
1624 // Results:
1625 //   rsp: pointer to blank ContinuationEntry
1626 //
1627 // Kills:
1628 //   rax
1629 //
1630 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1631   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1632   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1633   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1634 
1635   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1636   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1637 
1638   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1639   OopMap* map = new OopMap(frame_size, 0);
1640 
1641   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1642   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1643   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1644 
1645   return map;
1646 }
1647 
1648 //---------------------------- fill_continuation_entry ---------------------------
1649 //
1650 // Arguments:
1651 //   rsp: pointer to blank Continuation entry
1652 //   reg_cont_obj: pointer to the continuation
1653 //   reg_flags: flags
1654 //
1655 // Results:
1656 //   rsp: pointer to filled out ContinuationEntry
1657 //
1658 // Kills:
1659 //   rax
1660 //
1661 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1662   assert_different_registers(rax, reg_cont_obj, reg_flags);
1663 #ifdef ASSERT
1664   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1665 #endif
1666   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1667   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1668   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1669   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1670   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1671 
1672   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1673   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1674   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1675   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1676 
1677   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1678   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1679 }
1680 
1681 //---------------------------- continuation_enter_cleanup ---------------------------
1682 //
1683 // Arguments:
1684 //   rsp: pointer to the ContinuationEntry
1685 //
1686 // Results:
1687 //   rsp: pointer to the spilled rbp in the entry frame
1688 //
1689 // Kills:
1690 //   rbx
1691 //
1692 static void continuation_enter_cleanup(MacroAssembler* masm) {
1693 #ifdef ASSERT
1694   Label L_good_sp;
1695   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1696   __ jcc(Assembler::equal, L_good_sp);
1697   __ stop("Incorrect rsp at continuation_enter_cleanup");
1698   __ bind(L_good_sp);
1699 #endif
1700   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1701   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1702 
1703   if (CheckJNICalls) {
1704     // Check if this is a virtual thread continuation
1705     Label L_skip_vthread_code;
1706     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1707     __ jcc(Assembler::equal, L_skip_vthread_code);
1708 
1709     // If the held monitor count is > 0 and this vthread is terminating then
1710     // it failed to release a JNI monitor. So we issue the same log message
1711     // that JavaThread::exit does.
1712     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1713     __ jcc(Assembler::equal, L_skip_vthread_code);
1714 
1715     // rax may hold an exception oop, save it before the call
1716     __ push(rax);
1717     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1718     __ pop(rax);
1719 
1720     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1721     // on termination. The held count is implicitly zeroed below when we restore from
1722     // the parent held count (which has to be zero).
1723     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1724 
1725     __ bind(L_skip_vthread_code);
1726   }
1727 #ifdef ASSERT
1728   else {
1729     // Check if this is a virtual thread continuation
1730     Label L_skip_vthread_code;
1731     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1732     __ jcc(Assembler::equal, L_skip_vthread_code);
1733 
1734     // See comment just above. If not checking JNI calls, the JNI count is only
1735     // needed for assertion checking.
1736     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1737 
1738     __ bind(L_skip_vthread_code);
1739   }
1740 #endif
1741 
1742   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1743   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1744 
1745   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1746   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1747   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1748 }
1749 
1750 static void gen_continuation_enter(MacroAssembler* masm,
1751                                    const VMRegPair* regs,
1752                                    int& exception_offset,
1753                                    OopMapSet* oop_maps,
1754                                    int& frame_complete,
1755                                    int& stack_slots,
1756                                    int& interpreted_entry_offset,
1757                                    int& compiled_entry_offset) {
1758 
1759   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1760   int pos_cont_obj   = 0;
1761   int pos_is_cont    = 1;
1762   int pos_is_virtual = 2;
1763 
1764   // The platform-specific calling convention may present the arguments in various registers.
1765   // To simplify the rest of the code, we expect the arguments to reside in these known
1766   // registers, and we additionally check the placement here in case the calling convention
1767   // ever changes.
1768   Register reg_cont_obj   = c_rarg1;
1769   Register reg_is_cont    = c_rarg2;
1770   Register reg_is_virtual = c_rarg3;
1771 
1772   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1773   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1774   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1775 
1776   // Utility methods kill rax, make sure there are no collisions
1777   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1778 
1779   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1780                          relocInfo::static_call_type);
1781 
1782   address start = __ pc();
1783 
1784   Label L_thaw, L_exit;
1785 
1786   // i2i entry used at interp_only_mode only
1787   interpreted_entry_offset = __ pc() - start;
1788   {
1789 #ifdef ASSERT
1790     Label is_interp_only;
1791     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1792     __ jcc(Assembler::notEqual, is_interp_only);
1793     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1794     __ bind(is_interp_only);
1795 #endif
1796 
1797     __ pop(rax); // return address
1798     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1799     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1800     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1801     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1802     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1803     __ push(rax); // return address
1804     __ push_cont_fastpath();
1805 
1806     __ enter();
1807 
1808     stack_slots = 2; // will be adjusted in setup
1809     OopMap* map = continuation_enter_setup(masm, stack_slots);
1810     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1811     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1812 
1813     __ verify_oop(reg_cont_obj);
1814 
1815     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1816 
1817     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1818     __ testptr(reg_is_cont, reg_is_cont);
1819     __ jcc(Assembler::notZero, L_thaw);
1820 
1821     // --- Resolve path
1822 
1823     // Make sure the call is patchable
1824     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1825     // Emit stub for static call
1826     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1827     if (stub == nullptr) {
1828       fatal("CodeCache is full at gen_continuation_enter");
1829     }
1830     __ call(resolve);
1831     oop_maps->add_gc_map(__ pc() - start, map);
1832     __ post_call_nop();
1833 
1834     __ jmp(L_exit);
1835   }
1836 
1837   // compiled entry
1838   __ align(CodeEntryAlignment);
1839   compiled_entry_offset = __ pc() - start;
1840   __ enter();
1841 
1842   stack_slots = 2; // will be adjusted in setup
1843   OopMap* map = continuation_enter_setup(masm, stack_slots);
1844 
1845   // Frame is now completed as far as size and linkage.
1846   frame_complete = __ pc() - start;
1847 
1848   __ verify_oop(reg_cont_obj);
1849 
1850   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1851 
1852   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1853   __ testptr(reg_is_cont, reg_is_cont);
1854   __ jccb(Assembler::notZero, L_thaw);
1855 
1856   // --- call Continuation.enter(Continuation c, boolean isContinue)
1857 
1858   // Make sure the call is patchable
1859   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1860 
1861   // Emit stub for static call
1862   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1863   if (stub == nullptr) {
1864     fatal("CodeCache is full at gen_continuation_enter");
1865   }
1866 
1867   // The call needs to be resolved. There's a special case for this in
1868   // SharedRuntime::find_callee_info_helper() which calls
1869   // LinkResolver::resolve_continuation_enter() which resolves the call to
1870   // Continuation.enter(Continuation c, boolean isContinue).
1871   __ call(resolve);
1872 
1873   oop_maps->add_gc_map(__ pc() - start, map);
1874   __ post_call_nop();
1875 
1876   __ jmpb(L_exit);
1877 
1878   // --- Thawing path
1879 
1880   __ bind(L_thaw);
1881 
1882   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1883   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1884 
1885   ContinuationEntry::_return_pc_offset = __ pc() - start;
1886   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1887   __ post_call_nop();
1888 
1889   // --- Normal exit (resolve/thawing)
1890 
1891   __ bind(L_exit);
1892   ContinuationEntry::_cleanup_offset = __ pc() - start;
1893   continuation_enter_cleanup(masm);
1894   __ pop(rbp);
1895   __ ret(0);
1896 
1897   // --- Exception handling path
1898 
1899   exception_offset = __ pc() - start;
1900 
1901   continuation_enter_cleanup(masm);
1902   __ pop(rbp);
1903 
1904   __ movptr(c_rarg0, r15_thread);
1905   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1906 
1907   // rax still holds the original exception oop, save it before the call
1908   __ push(rax);
1909 
1910   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1911   __ movptr(rbx, rax);
1912 
1913   // Continue at exception handler:
1914   //   rax: exception oop
1915   //   rbx: exception handler
1916   //   rdx: exception pc
1917   __ pop(rax);
1918   __ verify_oop(rax);
1919   __ pop(rdx);
1920   __ jmp(rbx);
1921 }
1922 
1923 static void gen_continuation_yield(MacroAssembler* masm,
1924                                    const VMRegPair* regs,
1925                                    OopMapSet* oop_maps,
1926                                    int& frame_complete,
1927                                    int& stack_slots,
1928                                    int& compiled_entry_offset) {
1929   enum layout {
1930     rbp_off,
1931     rbpH_off,
1932     return_off,
1933     return_off2,
1934     framesize // inclusive of return address
1935   };
1936   stack_slots = framesize /  VMRegImpl::slots_per_word;
1937   assert(stack_slots == 2, "recheck layout");
1938 
1939   address start = __ pc();
1940   compiled_entry_offset = __ pc() - start;
1941   __ enter();
1942   address the_pc = __ pc();
1943 
1944   frame_complete = the_pc - start;
1945 
1946   // This nop must be exactly at the PC we push into the frame info.
1947   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1948   // with it right away.
1949   __ post_call_nop();
1950   OopMap* map = new OopMap(framesize, 1);
1951   oop_maps->add_gc_map(frame_complete, map);
1952 
1953   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1954   __ movptr(c_rarg0, r15_thread);
1955   __ movptr(c_rarg1, rsp);
1956   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1957   __ reset_last_Java_frame(true);
1958 
1959   Label L_pinned;
1960 
1961   __ testptr(rax, rax);
1962   __ jcc(Assembler::notZero, L_pinned);
1963 
1964   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1965   continuation_enter_cleanup(masm);
1966   __ pop(rbp);
1967   __ ret(0);
1968 
1969   __ bind(L_pinned);
1970 
1971   // Pinned, return to caller
1972 
1973   // handle pending exception thrown by freeze
1974   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1975   Label ok;
1976   __ jcc(Assembler::equal, ok);
1977   __ leave();
1978   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1979   __ bind(ok);
1980 
1981   __ leave();
1982   __ ret(0);
1983 }
1984 
1985 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1986   ::continuation_enter_cleanup(masm);
1987 }
1988 
1989 static void gen_special_dispatch(MacroAssembler* masm,
1990                                  const methodHandle& method,
1991                                  const BasicType* sig_bt,
1992                                  const VMRegPair* regs) {
1993   verify_oop_args(masm, method, sig_bt, regs);
1994   vmIntrinsics::ID iid = method->intrinsic_id();
1995 
1996   // Now write the args into the outgoing interpreter space
1997   bool     has_receiver   = false;
1998   Register receiver_reg   = noreg;
1999   int      member_arg_pos = -1;
2000   Register member_reg     = noreg;
2001   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
2002   if (ref_kind != 0) {
2003     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
2004     member_reg = rbx;  // known to be free at this point
2005     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
2006   } else if (iid == vmIntrinsics::_invokeBasic) {
2007     has_receiver = true;
2008   } else if (iid == vmIntrinsics::_linkToNative) {
2009     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
2010     member_reg = rbx;  // known to be free at this point
2011   } else {
2012     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
2013   }
2014 
2015   if (member_reg != noreg) {
2016     // Load the member_arg into register, if necessary.
2017     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
2018     VMReg r = regs[member_arg_pos].first();
2019     if (r->is_stack()) {
2020       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2021     } else {
2022       // no data motion is needed
2023       member_reg = r->as_Register();
2024     }
2025   }
2026 
2027   if (has_receiver) {
2028     // Make sure the receiver is loaded into a register.
2029     assert(method->size_of_parameters() > 0, "oob");
2030     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
2031     VMReg r = regs[0].first();
2032     assert(r->is_valid(), "bad receiver arg");
2033     if (r->is_stack()) {
2034       // Porting note:  This assumes that compiled calling conventions always
2035       // pass the receiver oop in a register.  If this is not true on some
2036       // platform, pick a temp and load the receiver from stack.
2037       fatal("receiver always in a register");
2038       receiver_reg = j_rarg0;  // known to be free at this point
2039       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2040     } else {
2041       // no data motion is needed
2042       receiver_reg = r->as_Register();
2043     }
2044   }
2045 
2046   // Figure out which address we are really jumping to:
2047   MethodHandles::generate_method_handle_dispatch(masm, iid,
2048                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2049 }
2050 
2051 // ---------------------------------------------------------------------------
2052 // Generate a native wrapper for a given method.  The method takes arguments
2053 // in the Java compiled code convention, marshals them to the native
2054 // convention (handlizes oops, etc), transitions to native, makes the call,
2055 // returns to java state (possibly blocking), unhandlizes any result and
2056 // returns.
2057 //
2058 // Critical native functions are a shorthand for the use of
2059 // GetPrimitiveArrayCritical and disallow the use of any other JNI
2060 // functions.  The wrapper is expected to unpack the arguments before
2061 // passing them to the callee. Critical native functions leave the state _in_Java,
2062 // since they cannot stop for GC.
2063 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
2064 // block and the check for pending exceptions, since it is impossible for them
2065 // to be thrown.
2066 //
2067 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2068                                                 const methodHandle& method,
2069                                                 int compile_id,
2070                                                 BasicType* in_sig_bt,
2071                                                 VMRegPair* in_regs,
2072                                                 BasicType ret_type) {
2073   if (method->is_continuation_native_intrinsic()) {
2074     int exception_offset = -1;
2075     OopMapSet* oop_maps = new OopMapSet();
2076     int frame_complete = -1;
2077     int stack_slots = -1;
2078     int interpreted_entry_offset = -1;
2079     int vep_offset = -1;
2080     if (method->is_continuation_enter_intrinsic()) {
2081       gen_continuation_enter(masm,
2082                              in_regs,
2083                              exception_offset,
2084                              oop_maps,
2085                              frame_complete,
2086                              stack_slots,
2087                              interpreted_entry_offset,
2088                              vep_offset);
2089     } else if (method->is_continuation_yield_intrinsic()) {
2090       gen_continuation_yield(masm,
2091                              in_regs,
2092                              oop_maps,
2093                              frame_complete,
2094                              stack_slots,
2095                              vep_offset);
2096     } else {
2097       guarantee(false, "Unknown Continuation native intrinsic");
2098     }
2099 
2100 #ifdef ASSERT
2101     if (method->is_continuation_enter_intrinsic()) {
2102       assert(interpreted_entry_offset != -1, "Must be set");
2103       assert(exception_offset != -1,         "Must be set");
2104     } else {
2105       assert(interpreted_entry_offset == -1, "Must be unset");
2106       assert(exception_offset == -1,         "Must be unset");
2107     }
2108     assert(frame_complete != -1,    "Must be set");
2109     assert(stack_slots != -1,       "Must be set");
2110     assert(vep_offset != -1,        "Must be set");
2111 #endif
2112 
2113     __ flush();
2114     nmethod* nm = nmethod::new_native_nmethod(method,
2115                                               compile_id,
2116                                               masm->code(),
2117                                               vep_offset,
2118                                               frame_complete,
2119                                               stack_slots,
2120                                               in_ByteSize(-1),
2121                                               in_ByteSize(-1),
2122                                               oop_maps,
2123                                               exception_offset);
2124     if (nm == nullptr) return nm;
2125     if (method->is_continuation_enter_intrinsic()) {
2126       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2127     } else if (method->is_continuation_yield_intrinsic()) {
2128       _cont_doYield_stub = nm;
2129     }
2130     return nm;
2131   }
2132 
2133   if (method->is_method_handle_intrinsic()) {
2134     vmIntrinsics::ID iid = method->intrinsic_id();
2135     intptr_t start = (intptr_t)__ pc();
2136     int vep_offset = ((intptr_t)__ pc()) - start;
2137     gen_special_dispatch(masm,
2138                          method,
2139                          in_sig_bt,
2140                          in_regs);
2141     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2142     __ flush();
2143     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2144     return nmethod::new_native_nmethod(method,
2145                                        compile_id,
2146                                        masm->code(),
2147                                        vep_offset,
2148                                        frame_complete,
2149                                        stack_slots / VMRegImpl::slots_per_word,
2150                                        in_ByteSize(-1),
2151                                        in_ByteSize(-1),
2152                                        nullptr);
2153   }
2154   address native_func = method->native_function();
2155   assert(native_func != nullptr, "must have function");
2156 
2157   // An OopMap for lock (and class if static)
2158   OopMapSet *oop_maps = new OopMapSet();
2159   intptr_t start = (intptr_t)__ pc();
2160 
2161   // We have received a description of where all the Java args are located
2162   // on entry to the wrapper. We need to convert these args to where
2163   // the JNI function will expect them. To figure out where they go
2164   // we convert the Java signature to a C signature by inserting
2165   // the hidden arguments as arg[0] and possibly arg[1] (static method).
2166 
2167   const int total_in_args = method->size_of_parameters();
2168   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2169 
2170   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2171   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2172 
2173   int argc = 0;
2174   out_sig_bt[argc++] = T_ADDRESS;
2175   if (method->is_static()) {
2176     out_sig_bt[argc++] = T_OBJECT;
2177   }
2178 
2179   for (int i = 0; i < total_in_args ; i++ ) {
2180     out_sig_bt[argc++] = in_sig_bt[i];
2181   }
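       // For example, a static native method taking (jint, jobject) ends up with
       // out_sig_bt = { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_INT, T_OBJECT }.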
2182 
2183   // Now figure out where the args must be stored and how much stack space
2184   // they require.
2185   int out_arg_slots;
2186   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2187 
2188   // Compute framesize for the wrapper.  We need to handlize all oops in
2189   // incoming registers
2190 
2191   // Calculate the total number of stack slots we will need.
2192 
2193   // First count the abi requirement plus all of the outgoing args
2194   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2195 
2196   // Now the space for the inbound oop handle area
2197   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
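       // (one word for each of the six integer registers that can carry an oop
       //  argument in the Java calling convention, j_rarg0..j_rarg5)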
2198 
2199   int oop_handle_offset = stack_slots;
2200   stack_slots += total_save_slots;
2201 
2202   // Now any space we need for handlizing a klass if this is a static method
2203 
2204   int klass_slot_offset = 0;
2205   int klass_offset = -1;
2206   int lock_slot_offset = 0;
2207   bool is_static = false;
2208 
2209   if (method->is_static()) {
2210     klass_slot_offset = stack_slots;
2211     stack_slots += VMRegImpl::slots_per_word;
2212     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2213     is_static = true;
2214   }
2215 
2216   // Plus a lock if needed
2217 
2218   if (method->is_synchronized()) {
2219     lock_slot_offset = stack_slots;
2220     stack_slots += VMRegImpl::slots_per_word;
2221   }
2222 
2223   // Now a place (+2 slots) to save return values or temps during shuffling
2224   // + 4 slots for the return address (which we own) and the saved rbp
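       // (that is: 2 slots for temps/results, 2 for the return address, 2 for rbp)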
2225   stack_slots += 6;
2226 
2227   // Ok The space we have allocated will look like:
2228   //
2229   //
2230   // FP-> |                     |
2231   //      |---------------------|
2232   //      | 2 slots for moves   |
2233   //      |---------------------|
2234   //      | lock box (if sync)  |
2235   //      |---------------------| <- lock_slot_offset
2236   //      | klass (if static)   |
2237   //      |---------------------| <- klass_slot_offset
2238   //      | oopHandle area      |
2239   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2240   //      | outbound memory     |
2241   //      | based arguments     |
2242   //      |                     |
2243   //      |---------------------|
2244   //      |                     |
2245   // SP-> | out_preserved_slots |
2246   //
2247   //
2248 
2249 
2250   // Now compute the actual number of stack words we need, rounding to keep
2251   // the stack properly aligned.
2252   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2253 
2254   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2255 
2256   // First thing make an ic check to see if we should even be here
2257 
2258   // We are free to use all registers as temps without saving them and
2259   // restoring them except rbp. rbp is the only callee save register
2260   // as far as the interpreter and the compiler(s) are concerned.
2261 
2262   const Register receiver = j_rarg0;
2263 
2264   Label exception_pending;
2265 
2266   assert_different_registers(receiver, rscratch1, rscratch2);
2267   __ verify_oop(receiver);
2268   __ ic_check(8 /* end_alignment */);
2269 
2270   int vep_offset = ((intptr_t)__ pc()) - start;
2271 
2272   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2273     Label L_skip_barrier;
2274     Register klass = r10;
2275     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2276     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2277 
2278     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2279 
2280     __ bind(L_skip_barrier);
2281   }
2282 
2283 #ifdef COMPILER1
2284   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2285   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2286     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2287   }
2288 #endif // COMPILER1
2289 
2290   // The instruction at the verified entry point must be 5 bytes or longer
2291   // because it can be patched on the fly by make_non_entrant. The stack bang
2292   // instruction fits that requirement.
2293 
2294   // Generate stack overflow check
2295   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2296 
2297   // Generate a new frame for the wrapper.
2298   __ enter();
2299   // -2 because return address is already present and so is saved rbp
2300   __ subptr(rsp, stack_size - 2*wordSize);
2301 
2302   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2303   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2304   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2305 
2306   // Frame is now completed as far as size and linkage.
2307   int frame_complete = ((intptr_t)__ pc()) - start;
2308 
2309 #ifdef ASSERT
2310   __ check_stack_alignment(rsp, "improperly aligned stack");
2311 #endif /* ASSERT */
2312 
2313 
2314   // We use r14 as the oop handle for the receiver/klass
2315   // It is callee save so it survives the call to native
2316 
2317   const Register oop_handle_reg = r14;
2318 
2319   //
2320   // We immediately shuffle the arguments so that any vm call we have to
2321   // make from here on out (sync slow path, jvmti, etc.) we will have
2322   // captured the oops from our caller and have a valid oopMap for
2323   // them.
2324 
2325   // -----------------
2326   // The Grand Shuffle
2327 
2328   // The Java calling convention is either equal (linux) or denser (win64) than the
2329   // c calling convention. However, because of the jni_env argument the c calling
2330   // convention always has at least one more (and two for static) arguments than Java.
2331   // Therefore, if we move the args from java -> c backwards then we will never have
2332   // a register->register conflict and we don't have to build a dependency graph
2333   // and figure out how to break any cycles.
2334   //
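       // (On Linux, for example, j_rarg0 is the same physical register as c_rarg1,
       // j_rarg1 the same as c_rarg2, and so on, so for a non-static method most
       // register arguments are already where the C convention wants them.)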
2335 
2336   // Record esp-based slot for receiver on stack for non-static methods
2337   int receiver_offset = -1;
2338 
2339   // This is a trick. We double the stack slots so we can claim
2340   // the oops in the caller's frame. Since we are sure to have
2341   // more args than the caller, doubling is enough to make
2342   // sure we can capture all the incoming oop args from the
2343   // caller.
2344   //
2345   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2346 
2347   // Mark location of rbp (someday)
2348   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2349 
2350   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2351   // All inbound args are referenced based on rbp and all outbound args via rsp.
2352 
2353 
2354 #ifdef ASSERT
2355   bool reg_destroyed[Register::number_of_registers];
2356   bool freg_destroyed[XMMRegister::number_of_registers];
2357   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2358     reg_destroyed[r] = false;
2359   }
2360   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2361     freg_destroyed[f] = false;
2362   }
2363 
2364 #endif /* ASSERT */
2365 
2366   // For JNI natives the incoming and outgoing registers are offset upwards.
2367   GrowableArray<int> arg_order(2 * total_in_args);
2368 
2369   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2370     arg_order.push(i);
2371     arg_order.push(c_arg);
2372   }
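       // arg_order now holds (java index, c index) pairs from the last argument to
       // the first, so the moves below are emitted in the backwards order described
       // above.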
2373 
2374   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2375     int i = arg_order.at(ai);
2376     int c_arg = arg_order.at(ai + 1);
2377     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2378 #ifdef ASSERT
2379     if (in_regs[i].first()->is_Register()) {
2380       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2381     } else if (in_regs[i].first()->is_XMMRegister()) {
2382       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2383     }
2384     if (out_regs[c_arg].first()->is_Register()) {
2385       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2386     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2387       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2388     }
2389 #endif /* ASSERT */
2390     switch (in_sig_bt[i]) {
2391       case T_ARRAY:
2392       case T_OBJECT:
2393         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2394                     ((i == 0) && (!is_static)),
2395                     &receiver_offset);
2396         break;
2397       case T_VOID:
2398         break;
2399 
2400       case T_FLOAT:
2401         __ float_move(in_regs[i], out_regs[c_arg]);
2402         break;
2403 
2404       case T_DOUBLE:
2405         assert( i + 1 < total_in_args &&
2406                 in_sig_bt[i + 1] == T_VOID &&
2407                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2408         __ double_move(in_regs[i], out_regs[c_arg]);
2409         break;
2410 
2411       case T_LONG :
2412         __ long_move(in_regs[i], out_regs[c_arg]);
2413         break;
2414 
2415       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2416 
2417       default:
2418         __ move32_64(in_regs[i], out_regs[c_arg]);
2419     }
2420   }
2421 
2422   int c_arg;
2423 
2424   // Pre-load a static method's oop into r14.  Used both by locking code and
2425   // the normal JNI call code.
2426   // Point c_arg at the first arg that is already loaded in case we
2427   // need to spill before we call out.
2428   c_arg = total_c_args - total_in_args;
2429 
2430   if (method->is_static()) {
2431 
2432     //  load oop into a register
2433     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2434 
2435     // Now handlize the static class mirror; it's known to be non-null.
2436     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2437     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2438 
2439     // Now get the handle
2440     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2441     // store the klass handle as second argument
2442     __ movptr(c_rarg1, oop_handle_reg);
2443     // and protect the arg if we must spill
2444     c_arg--;
2445   }
2446 
2447   // Change state to native (we save the return address in the thread, since it might not
2448   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2449   // points into the right code segment. It does not have to be the correct return pc.
2450   // We use the same pc/oopMap repeatedly when we call out
2451 
2452   Label native_return;
2453   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2454     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2455     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2456   } else {
2457     intptr_t the_pc = (intptr_t) __ pc();
2458     oop_maps->add_gc_map(the_pc - start, map);
2459 
2460     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2461   }
2462 
2463   // We have all of the arguments set up at this point. We must not touch any register
2464   // argument registers at this point (even saving/restoring them is problematic, since no oop map describes them here).
2465 
2466   if (DTraceMethodProbes) {
2467     // protect the args we've loaded
2468     save_args(masm, total_c_args, c_arg, out_regs);
2469     __ mov_metadata(c_rarg1, method());
2470     __ call_VM_leaf(
2471       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2472       r15_thread, c_rarg1);
2473     restore_args(masm, total_c_args, c_arg, out_regs);
2474   }
2475 
2476   // RedefineClasses() tracing support for obsolete method entry
2477   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2478     // protect the args we've loaded
2479     save_args(masm, total_c_args, c_arg, out_regs);
2480     __ mov_metadata(c_rarg1, method());
2481     __ call_VM_leaf(
2482       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2483       r15_thread, c_rarg1);
2484     restore_args(masm, total_c_args, c_arg, out_regs);
2485   }
2486 
2487   // Lock a synchronized method
2488 
2489   // Register definitions used by locking and unlocking
2490 
2491   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2492   const Register obj_reg  = rbx;  // Will contain the oop
2493   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2494   const Register old_hdr  = r13;  // value of old header at unlock time
2495 
2496   Label slow_path_lock;
2497   Label lock_done;
2498 
2499   if (method->is_synchronized()) {
2500     Label count_mon;
2501 
2502     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2503 
2504     // Get the handle (the 2nd argument)
2505     __ mov(oop_handle_reg, c_rarg1);
2506 
2507     // Get address of the box
2508 
2509     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2510 
2511     // Load the oop from the handle
2512     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2513 
2514     if (LockingMode == LM_MONITOR) {
2515       __ jmp(slow_path_lock);
2516     } else if (LockingMode == LM_LEGACY) {
2517       // Load immediate 1 into swap_reg %rax
2518       __ movl(swap_reg, 1);
2519 
2520       // Load (object->mark() | 1) into swap_reg %rax
2521       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2522       if (EnableValhalla) {
2523         // Mask inline_type bit such that we go to the slow path if object is an inline type
2524         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2525       }
2526 
2527       // Save (object->mark() | 1) into BasicLock's displaced header
2528       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2529 
2530       // src -> dest iff dest == rax else rax <- dest
2531       __ lock();
2532       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2533       __ jcc(Assembler::equal, count_mon);
2534 
2535       // Hmm should this move to the slow path code area???
2536 
2537       // Test if the oopMark is an obvious stack pointer, i.e.,
2538       //  1) (mark & 3) == 0, and
2539       //  2) rsp <= mark < rsp + os::pagesize()
2540       // These 3 tests can be done by evaluating the following
2541       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2542       // assuming both the stack pointer and the page size have their
2543       // least significant 2 bits clear.
2544       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
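           // For example, with a 4K page, 3 - os::vm_page_size() == -4093
           // (0x...f003 in two's complement), so the AND below is zero exactly when
           // mark - rsp is a non-negative multiple of 4 smaller than the page size.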
2545 
2546       __ subptr(swap_reg, rsp);
2547       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2548 
2549       // Save the test result; for the recursive case, the result is zero
2550       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2551       __ jcc(Assembler::notEqual, slow_path_lock);
2552 
2553       __ bind(count_mon);
2554       __ inc_held_monitor_count();
2555     } else {
2556       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2557       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2558     }
2559 
2560     // Slow path will re-enter here
2561     __ bind(lock_done);
2562   }
2563 
2564   // Finally just about ready to make the JNI call
2565 
2566   // get JNIEnv* which is first argument to native
2567   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2568 
2569   // Now set thread in native
2570   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2571 
2572   __ call(RuntimeAddress(native_func));
2573 
2574   // Verify or restore cpu control state after JNI call
2575   __ restore_cpu_control_state_after_jni(rscratch1);
2576 
2577   // Unpack native results.
2578   switch (ret_type) {
2579   case T_BOOLEAN: __ c2bool(rax);            break;
2580   case T_CHAR   : __ movzwl(rax, rax);      break;
2581   case T_BYTE   : __ sign_extend_byte (rax); break;
2582   case T_SHORT  : __ sign_extend_short(rax); break;
2583   case T_INT    : /* nothing to do */        break;
2584   case T_DOUBLE :
2585   case T_FLOAT  :
2586     // Result is in xmm0 we'll save as needed
2587     break;
2588   case T_ARRAY:                 // Really a handle
2589   case T_OBJECT:                // Really a handle
2590       break; // can't de-handlize until after safepoint check
2591   case T_VOID: break;
2592   case T_LONG: break;
2593   default       : ShouldNotReachHere();
2594   }
2595 
2596   // Switch thread to "native transition" state before reading the synchronization state.
2597   // This additional state is necessary because reading and testing the synchronization
2598   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2599   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2600   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2601   //     Thread A is resumed to finish this native method, but doesn't block here since it
2602   //     didn't see any synchronization in progress, and escapes.
2603   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2604 
2605   // Force this write out before the read below
2606   if (!UseSystemMemoryBarrier) {
2607     __ membar(Assembler::Membar_mask_bits(
2608               Assembler::LoadLoad | Assembler::LoadStore |
2609               Assembler::StoreLoad | Assembler::StoreStore));
2610   }
2611 
2612   // check for safepoint operation in progress and/or pending suspend requests
2613   {
2614     Label Continue;
2615     Label slow_path;
2616 
2617     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2618 
2619     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2620     __ jcc(Assembler::equal, Continue);
2621     __ bind(slow_path);
2622 
2623     // Don't use call_VM, as it will see a possible pending exception and forward it,
2624     // never returning here and preventing us from clearing _last_native_pc down below.
2625     // Nor can we use call_VM_leaf, as it will check whether rsi and rdi are
2626     // preserved and correspond to the bcp/locals pointers. So we do the runtime call
2627     // by hand.
2628     //
2629     __ vzeroupper();
2630     save_native_result(masm, ret_type, stack_slots);
2631     __ mov(c_rarg0, r15_thread);
2632     __ mov(r12, rsp); // remember sp
2633     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2634     __ andptr(rsp, -16); // align stack as required by ABI
2635     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2636     __ mov(rsp, r12); // restore sp
2637     __ reinit_heapbase();
2638     // Restore any method result value
2639     restore_native_result(masm, ret_type, stack_slots);
2640     __ bind(Continue);
2641   }
2642 
2643   // change thread state
2644   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2645 
2646   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2647     // Check preemption for Object.wait()
2648     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2649     __ cmpptr(rscratch1, NULL_WORD);
2650     __ jccb(Assembler::equal, native_return);
2651     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2652     __ jmp(rscratch1);
2653     __ bind(native_return);
2654 
2655     intptr_t the_pc = (intptr_t) __ pc();
2656     oop_maps->add_gc_map(the_pc - start, map);
2657   }
2658 
2659 
2660   Label reguard;
2661   Label reguard_done;
2662   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2663   __ jcc(Assembler::equal, reguard);
2664   __ bind(reguard_done);
2665 
2666   // The native result, if any, is live here
2667 
2668   // Unlock
2669   Label slow_path_unlock;
2670   Label unlock_done;
2671   if (method->is_synchronized()) {
2672 
2673     Label fast_done;
2674 
2675     // Get locked oop from the handle we passed to jni
2676     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2677 
2678     if (LockingMode == LM_LEGACY) {
2679       Label not_recur;
2680       // Simple recursive lock?
2681       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2682       __ jcc(Assembler::notEqual, not_recur);
2683       __ dec_held_monitor_count();
2684       __ jmpb(fast_done);
2685       __ bind(not_recur);
2686     }
2687 
2688     // Must save rax if it is live now because cmpxchg must use it
2689     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2690       save_native_result(masm, ret_type, stack_slots);
2691     }
2692 
2693     if (LockingMode == LM_MONITOR) {
2694       __ jmp(slow_path_unlock);
2695     } else if (LockingMode == LM_LEGACY) {
2696       // get address of the stack lock
2697       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2698       //  get old displaced header
2699       __ movptr(old_hdr, Address(rax, 0));
2700 
2701       // Atomic swap old header if oop still contains the stack lock
2702       __ lock();
2703       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2704       __ jcc(Assembler::notEqual, slow_path_unlock);
2705       __ dec_held_monitor_count();
2706     } else {
2707       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2708       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2709     }
2710 
2711     // slow path re-enters here
2712     __ bind(unlock_done);
2713     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2714       restore_native_result(masm, ret_type, stack_slots);
2715     }
2716 
2717     __ bind(fast_done);
2718   }
2719   if (DTraceMethodProbes) {
2720     save_native_result(masm, ret_type, stack_slots);
2721     __ mov_metadata(c_rarg1, method());
2722     __ call_VM_leaf(
2723          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2724          r15_thread, c_rarg1);
2725     restore_native_result(masm, ret_type, stack_slots);
2726   }
2727 
2728   __ reset_last_Java_frame(false);
2729 
2730   // Unbox oop result, e.g. JNIHandles::resolve value.
2731   if (is_reference_type(ret_type)) {
2732     __ resolve_jobject(rax /* value */,
2733                        r15_thread /* thread */,
2734                        rcx /* tmp */);
2735   }
2736 
2737   if (CheckJNICalls) {
2738     // clear_pending_jni_exception_check
2739     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2740   }
2741 
2742   // reset handle block
2743   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2744   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2745 
2746   // pop our frame
2747 
2748   __ leave();
2749 
2750   // Any exception pending?
2751   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2752   __ jcc(Assembler::notEqual, exception_pending);
2753 
2754   // Return
2755 
2756   __ ret(0);
2757 
2758   // Unexpected paths are out of line and go here
2759 
2760   // forward the exception
2761   __ bind(exception_pending);
2762 
2763   // and forward the exception
2764   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2765 
2766   // Slow path locking & unlocking
2767   if (method->is_synchronized()) {
2768 
2769     // BEGIN Slow path lock
2770     __ bind(slow_path_lock);
2771 
2772     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2773     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2774 
2775     // protect the args we've loaded
2776     save_args(masm, total_c_args, c_arg, out_regs);
2777 
2778     __ mov(c_rarg0, obj_reg);
2779     __ mov(c_rarg1, lock_reg);
2780     __ mov(c_rarg2, r15_thread);
2781 
2782     // Not a leaf but we have last_Java_frame setup as we want.
2783     // We don't want to unmount in case of contention since that would complicate preserving
2784     // the arguments that had already been marshalled into the native convention. So we force
2785     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2786     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2787     __ push_cont_fastpath();
2788     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2789     __ pop_cont_fastpath();
2790     restore_args(masm, total_c_args, c_arg, out_regs);
2791 
2792 #ifdef ASSERT
2793     { Label L;
2794     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2795     __ jcc(Assembler::equal, L);
2796     __ stop("no pending exception allowed on exit from monitorenter");
2797     __ bind(L);
2798     }
2799 #endif
2800     __ jmp(lock_done);
2801 
2802     // END Slow path lock
2803 
2804     // BEGIN Slow path unlock
2805     __ bind(slow_path_unlock);
2806 
2807     // If we haven't already saved the native result we must save it now as xmm registers
2808     // are still exposed.
2809     __ vzeroupper();
2810     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2811       save_native_result(masm, ret_type, stack_slots);
2812     }
2813 
2814     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2815 
2816     __ mov(c_rarg0, obj_reg);
2817     __ mov(c_rarg2, r15_thread);
2818     __ mov(r12, rsp); // remember sp
2819     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2820     __ andptr(rsp, -16); // align stack as required by ABI
2821 
2822     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2823     // NOTE that obj_reg == rbx currently
2824     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2825     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2826 
2827     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2828     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2829     __ mov(rsp, r12); // restore sp
2830     __ reinit_heapbase();
2831 #ifdef ASSERT
2832     {
2833       Label L;
2834       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2835       __ jcc(Assembler::equal, L);
2836       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2837       __ bind(L);
2838     }
2839 #endif /* ASSERT */
2840 
2841     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2842 
2843     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2844       restore_native_result(masm, ret_type, stack_slots);
2845     }
2846     __ jmp(unlock_done);
2847 
2848     // END Slow path unlock
2849 
2850   } // synchronized
2851 
2852   // SLOW PATH Reguard the stack if needed
2853 
2854   __ bind(reguard);
2855   __ vzeroupper();
2856   save_native_result(masm, ret_type, stack_slots);
2857   __ mov(r12, rsp); // remember sp
2858   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2859   __ andptr(rsp, -16); // align stack as required by ABI
2860   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2861   __ mov(rsp, r12); // restore sp
2862   __ reinit_heapbase();
2863   restore_native_result(masm, ret_type, stack_slots);
2864   // and continue
2865   __ jmp(reguard_done);
2866 
2867 
2868 
2869   __ flush();
2870 
2871   nmethod *nm = nmethod::new_native_nmethod(method,
2872                                             compile_id,
2873                                             masm->code(),
2874                                             vep_offset,
2875                                             frame_complete,
2876                                             stack_slots / VMRegImpl::slots_per_word,
2877                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2878                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2879                                             oop_maps);
2880 
2881   return nm;
2882 }
2883 
2884 // This function returns the adjustment (in number of words) to a c2i adapter
2885 // activation for use during deoptimization.
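     // For example, a callee with 2 parameters and 5 locals needs an adjustment of
     // (5 - 2) * Interpreter::stackElementWords words.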
2886 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2887   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2888 }
2889 
2890 
2891 uint SharedRuntime::out_preserve_stack_slots() {
2892   return 0;
2893 }
2894 
2895 
2896 // Number of stack slots between incoming argument block and the start of
2897 // a new frame.  The PROLOG must add this many slots to the stack.  The
2898 // EPILOG must remove this many slots.  amd64 needs two slots for
2899 // return address.
2900 uint SharedRuntime::in_preserve_stack_slots() {
2901   return 4 + 2 * VerifyStackAtCalls;
2902 }
2903 
2904 VMReg SharedRuntime::thread_register() {
2905   return r15_thread->as_VMReg();
2906 }
2907 
2908 //------------------------------generate_deopt_blob----------------------------
2909 void SharedRuntime::generate_deopt_blob() {
2910   // Allocate space for the code
2911   ResourceMark rm;
2912   // Setup code generation tools
2913   int pad = 0;
2914   if (UseAVX > 2) {
2915     pad += 1024;
2916   }
2917   if (UseAPX) {
2918     pad += 1024;
2919   }
2920 #if INCLUDE_JVMCI
2921   if (EnableJVMCI) {
2922     pad += 512; // Increase the buffer size when compiling for JVMCI
2923   }
2924 #endif
2925   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2926   CodeBuffer buffer(name, 2560+pad, 1024);
2927   MacroAssembler* masm = new MacroAssembler(&buffer);
2928   int frame_size_in_words;
2929   OopMap* map = nullptr;
2930   OopMapSet *oop_maps = new OopMapSet();
2931 
2932   // -------------
2933   // This code enters when returning to a de-optimized nmethod.  A return
2934   // address has been pushed on the stack, and return values are in
2935   // registers.
2936   // If we are doing a normal deopt then we were called from the patched
2937   // nmethod from the point we returned to the nmethod. So the return
2938   // address on the stack is wrong by NativeCall::instruction_size
2939   // We will adjust the value so it looks like we have the original return
2940   // address on the stack (like when we eagerly deoptimized).
2941   // In the case of an exception pending when deoptimizing, we enter
2942   // with a return address on the stack that points after the call we patched
2943   // into the exception handler. We have the following register state from,
2944   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2945   //    rax: exception oop
2946   //    rbx: exception handler
2947   //    rdx: throwing pc
2948   // So in this case we simply jam rdx into the useless return address and
2949   // the stack looks just like we want.
2950   //
2951   // At this point we need to de-opt.  We save the argument return
2952   // registers.  We call the first C routine, fetch_unroll_info().  This
2953   // routine captures the return values and returns a structure which
2954   // describes the current frame size and the sizes of all replacement frames.
2955   // The current frame is compiled code and may contain many inlined
2956   // functions, each with their own JVM state.  We pop the current frame, then
2957   // push all the new frames.  Then we call the C routine unpack_frames() to
2958   // populate these frames.  Finally unpack_frames() returns us the new target
2959   // address.  Notice that callee-save registers are BLOWN here; they have
2960   // already been captured in the vframeArray at the time the return PC was
2961   // patched.
2962   address start = __ pc();
2963   Label cont;
2964 
2965   // Prolog for the non-exception case!
2966 
2967   // Save everything in sight.
2968   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2969 
2970   // Normal deoptimization.  Save exec mode for unpack_frames.
2971   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2972   __ jmp(cont);
2973 
2974   int reexecute_offset = __ pc() - start;
2975 #if INCLUDE_JVMCI && !defined(COMPILER1)
2976   if (UseJVMCICompiler) {
2977     // JVMCI does not use this kind of deoptimization
2978     __ should_not_reach_here();
2979   }
2980 #endif
2981 
2982   // Reexecute case
2983   // The return address is the pc that describes what bci to re-execute at.
2984 
2985   // No need to update map, as each call to save_live_registers will produce an identical oopmap.
2986   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2987 
2988   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2989   __ jmp(cont);
2990 
2991 #if INCLUDE_JVMCI
2992   Label after_fetch_unroll_info_call;
2993   int implicit_exception_uncommon_trap_offset = 0;
2994   int uncommon_trap_offset = 0;
2995 
2996   if (EnableJVMCI) {
2997     implicit_exception_uncommon_trap_offset = __ pc() - start;
2998 
2999     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
3000     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
3001 
3002     uncommon_trap_offset = __ pc() - start;
3003 
3004     // Save everything in sight.
3005     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3006     // fetch_unroll_info needs to call last_java_frame()
3007     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3008 
3009     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
3010     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
3011 
3012     __ movl(r14, Deoptimization::Unpack_reexecute);
3013     __ mov(c_rarg0, r15_thread);
3014     __ movl(c_rarg2, r14); // exec mode
3015     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3016     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
3017 
3018     __ reset_last_Java_frame(false);
3019 
3020     __ jmp(after_fetch_unroll_info_call);
3021   } // EnableJVMCI
3022 #endif // INCLUDE_JVMCI
3023 
3024   int exception_offset = __ pc() - start;
3025 
3026   // Prolog for exception case
3027 
3028   // All registers are dead at this entry point, except for rax and
3029   // rdx, which contain the exception oop and exception pc
3030   // respectively.  Set them in TLS and fall through to the
3031   // unpack_with_exception_in_tls entry point.
3032 
3033   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3034   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3035 
3036   int exception_in_tls_offset = __ pc() - start;
3037 
3038   // new implementation because exception oop is now passed in JavaThread
3039 
3040   // Prolog for exception case
3041   // All registers must be preserved because they might be used by LinearScan
3042   // Exception oop and throwing PC are passed in JavaThread
3043   // tos: stack at point of call to method that threw the exception (i.e. only
3044   // args are on the stack, no return address)
3045 
3046   // make room on stack for the return address
3047   // It will be patched later with the throwing pc. The correct value is not
3048   // available now because loading it from memory would destroy registers.
3049   __ push(0);
3050 
3051   // Save everything in sight.
3052   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3053 
3054   // Now it is safe to overwrite any register
3055 
3056   // Deopt during an exception.  Save exec mode for unpack_frames.
3057   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3058 
3059   // load throwing pc from JavaThread and patch it as the return address
3060   // of the current frame. Then clear the field in JavaThread
3061 
3062   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3063   __ movptr(Address(rbp, wordSize), rdx);
3064   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3065 
3066 #ifdef ASSERT
3067   // verify that there is really an exception oop in JavaThread
3068   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3069   __ verify_oop(rax);
3070 
3071   // verify that there is no pending exception
3072   Label no_pending_exception;
3073   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3074   __ testptr(rax, rax);
3075   __ jcc(Assembler::zero, no_pending_exception);
3076   __ stop("must not have pending exception here");
3077   __ bind(no_pending_exception);
3078 #endif
3079 
3080   __ bind(cont);
3081 
3082   // Call C code.  Need thread and this frame, but NOT official VM entry
3083   // crud.  We cannot block on this call, no GC can happen.
3084   //
3085   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3086 
3087   // fetch_unroll_info needs to call last_java_frame().
3088 
3089   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3090 #ifdef ASSERT
3091   { Label L;
3092     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3093     __ jcc(Assembler::equal, L);
3094     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3095     __ bind(L);
3096   }
3097 #endif // ASSERT
3098   __ mov(c_rarg0, r15_thread);
3099   __ movl(c_rarg1, r14); // exec_mode
3100   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3101 
3102   // Need to have an oopmap that tells fetch_unroll_info where to
3103   // find any register it might need.
3104   oop_maps->add_gc_map(__ pc() - start, map);
3105 
3106   __ reset_last_Java_frame(false);
3107 
3108 #if INCLUDE_JVMCI
3109   if (EnableJVMCI) {
3110     __ bind(after_fetch_unroll_info_call);
3111   }
3112 #endif
3113 
3114   // Load UnrollBlock* into rdi
3115   __ mov(rdi, rax);
3116 
3117   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3118   Label noException;
3119   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3120   __ jcc(Assembler::notEqual, noException);
3121   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3122   // QQQ this is useless it was null above
3123   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3124   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3125   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3126 
3127   __ verify_oop(rax);
3128 
3129   // Overwrite the result registers with the exception results.
3130   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3131   // I think this is useless
3132   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3133 
3134   __ bind(noException);
3135 
3136   // Only register save data is on the stack.
3137   // Now restore the result registers.  Everything else is either dead
3138   // or captured in the vframeArray.
3139   RegisterSaver::restore_result_registers(masm);
3140 
3141   // All of the register save area has been popped off the stack. Only the
3142   // return address remains.
3143 
3144   // Pop all the frames we must move/replace.
3145   //
3146   // Frame picture (youngest to oldest)
3147   // 1: self-frame (no frame link)
3148   // 2: deopting frame  (no frame link)
3149   // 3: caller of deopting frame (could be compiled/interpreted).
3150   //
3151   // Note: by leaving the return address of self-frame on the stack
3152   // and using the size of frame 2 to adjust the stack
3153   // when we are done the return to frame 3 will still be on the stack.
3154 
3155   // Pop deoptimized frame
3156   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3157   __ addptr(rsp, rcx);
3158 
3159   // rsp should be pointing at the return address to the caller (3)
3160 
3161   // Pick up the initial fp we should save
3162   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3163   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3164 
3165 #ifdef ASSERT
3166   // Compilers generate code that bangs the stack by as much as the
3167   // interpreter would need. So this stack banging should never
3168   // trigger a fault. Verify that it does not on non-product builds.
3169   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3170   __ bang_stack_size(rbx, rcx);
3171 #endif
3172 
3173   // Load address of array of frame pcs into rcx
3174   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3175 
3176   // Trash the old pc
3177   __ addptr(rsp, wordSize);
3178 
3179   // Load address of array of frame sizes into rsi
3180   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3181 
3182   // Load counter into rdx
3183   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3184 
3185   // Now adjust the caller's stack to make up for the extra locals, but
3186   // record the original sp first so that we can save it in the skeletal
3187   // interpreter frame; the stack walking of interpreter_sender will then get
3188   // the unextended sp value and not the "real" sp value.
3189 
3190   const Register sender_sp = r8;
3191 
3192   __ mov(sender_sp, rsp);
3193   __ movl(rbx, Address(rdi,
3194                        Deoptimization::UnrollBlock::
3195                        caller_adjustment_offset()));
3196   __ subptr(rsp, rbx);
3197 
3198   // Push interpreter frames in a loop
3199   Label loop;
3200   __ bind(loop);
3201   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3202   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3203   __ pushptr(Address(rcx, 0));          // Save return address
3204   __ enter();                           // Save old & set new ebp
3205   __ subptr(rsp, rbx);                  // Prolog
3206   // This value is corrected by layout_activation_impl
3207   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3208   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3209   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3210   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3211   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3212   __ decrementl(rdx);                   // Decrement counter
3213   __ jcc(Assembler::notZero, loop);
3214   __ pushptr(Address(rcx, 0));          // Save final return address
3215 
3216   // Re-push self-frame
3217   __ enter();                           // Save old & set new ebp
3218 
3219   // Allocate a full sized register save area.
3220   // Return address and rbp are in place, so we allocate two fewer words.
3221   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3222 
3223   // Restore frame locals after moving the frame
3224   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3225   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3226 
3227   // Call C code.  Need thread but NOT official VM entry
3228   // crud.  We cannot block on this call, no GC can happen.  Call should
3229   // restore return values to their stack-slots with the new SP.
3230   //
3231   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3232 
3233   // Use rbp because the frames look interpreted now
3234   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3235   // Don't need the precise return PC here, just precise enough to point into this code blob.
3236   address the_pc = __ pc();
3237   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3238 
3239   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3240   __ mov(c_rarg0, r15_thread);
3241   __ movl(c_rarg1, r14); // second arg: exec_mode
3242   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3243   // Revert SP alignment after call since we're going to do some SP relative addressing below
3244   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3245 
3246   // Set an oopmap for the call site
3247   // Use the same PC we used for the last java frame
3248   oop_maps->add_gc_map(the_pc - start,
3249                        new OopMap( frame_size_in_words, 0 ));
3250 
3251   // Clear fp AND pc
3252   __ reset_last_Java_frame(true);
3253 
3254   // Collect return values
3255   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3256   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3257   // I think this is useless (throwing pc?)
3258   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3259 
3260   // Pop self-frame.
3261   __ leave();                           // Epilog
3262 
3263   // Jump to interpreter
3264   __ ret(0);
3265 
3266   // Make sure all code is generated
3267   masm->flush();
3268 
3269   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3270   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3271 #if INCLUDE_JVMCI
3272   if (EnableJVMCI) {
3273     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3274     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3275   }
3276 #endif
3277 }
3278 
3279 //------------------------------generate_handler_blob------
3280 //
3281 // Generate a special Compile2Runtime blob that saves all registers
3282 // and sets up an oopmap.
3283 //
3284 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3285   assert(StubRoutines::forward_exception_entry() != nullptr,
3286          "must be generated before");
3287   assert(is_polling_page_id(id), "expected a polling page stub id");
3288 
3289   ResourceMark rm;
3290   OopMapSet *oop_maps = new OopMapSet();
3291   OopMap* map;
3292 
3293   // Allocate space for the code.  Setup code generation tools.
3294   const char* name = SharedRuntime::stub_name(id);
3295   CodeBuffer buffer(name, 2548, 1024);
3296   MacroAssembler* masm = new MacroAssembler(&buffer);
3297 
3298   address start   = __ pc();
3299   address call_pc = nullptr;
3300   int frame_size_in_words;
3301   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3302   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3303 
3304   // Make room for return address (or push it again)
3305   if (!cause_return) {
3306     __ push(rbx);
3307   }
3308 
3309   // Save registers, fpu state, and flags
3310   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3311 
3312   // The following is basically a call_VM.  However, we need the precise
3313   // address of the call in order to generate an oopmap. Hence, we do all the
3314   // work ourselves.
3315 
3316   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3317 
3318   // The return address must always be correct so that the frame constructor
3319   // never sees an invalid pc.
3320 
3321   if (!cause_return) {
3322     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3323     // Additionally, rbx is a callee saved register and we can look at it later to determine
3324     // if someone changed the return address for us!
3325     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3326     __ movptr(Address(rbp, wordSize), rbx);
3327   }
3328 
3329   // Do the call
3330   __ mov(c_rarg0, r15_thread);
3331   __ call(RuntimeAddress(call_ptr));
3332 
3333   // Set an oopmap for the call site.  This oopmap will map all
3334   // oop-registers and debug-info registers as callee-saved.  This
3335   // will allow deoptimization at this safepoint to find all possible
3336   // debug-info recordings, as well as let GC find all oops.
3337 
3338   oop_maps->add_gc_map( __ pc() - start, map);
3339 
3340   Label noException;
3341 
3342   __ reset_last_Java_frame(false);
3343 
3344   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3345   __ jcc(Assembler::equal, noException);
3346 
3347   // Exception pending
3348 
3349   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3350 
3351   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3352 
3353   // No exception case
3354   __ bind(noException);
3355 
3356   Label no_adjust;
3357 #ifdef ASSERT
3358   Label bail;
3359 #endif
3360   if (!cause_return) {
3361     Label no_prefix, not_special, check_rex_prefix;
3362 
3363     // If our stashed return pc was modified by the runtime we avoid touching it
3364     __ cmpptr(rbx, Address(rbp, wordSize));
3365     __ jcc(Assembler::notEqual, no_adjust);
3366 
3367     // Skip over the poll instruction.
3368     // See NativeInstruction::is_safepoint_poll()
3369     // Possible encodings:
3370     //      85 00       test   %eax,(%rax)
3371     //      85 01       test   %eax,(%rcx)
3372     //      85 02       test   %eax,(%rdx)
3373     //      85 03       test   %eax,(%rbx)
3374     //      85 06       test   %eax,(%rsi)
3375     //      85 07       test   %eax,(%rdi)
3376     //
3377     //   41 85 00       test   %eax,(%r8)
3378     //   41 85 01       test   %eax,(%r9)
3379     //   41 85 02       test   %eax,(%r10)
3380     //   41 85 03       test   %eax,(%r11)
3381     //   41 85 06       test   %eax,(%r14)
3382     //   41 85 07       test   %eax,(%r15)
3383     //
3384     //      85 04 24    test   %eax,(%rsp)
3385     //   41 85 04 24    test   %eax,(%r12)
3386     //      85 45 00    test   %eax,0x0(%rbp)
3387     //   41 85 45 00    test   %eax,0x0(%r13)
3388     //
3389     // Notes:
3390     //  Format of the legacy MAP0 test instruction:
3391     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3392     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3393     //     register operand and of the base register of the memory operand are in [0-8), so no
3394     //     additional REX prefix (whose REX.B bit would hold the MSB of the register encoding)
3395     //     is required and a two-byte encoding is sufficient.
3396     //  o  For a polling instruction like "test %eax,(%r8)", the encoding of the BASE register
3397     //     of the memory operand is 1000, so an additional REX prefix is needed in this case,
3398     //     thereby adding one byte to the instruction encoding.
3399     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
3400     //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
3401     //     the most significant two bits of the 5-bit register encoding.
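         //
         //  The code below therefore advances rbx past an optional two-byte REX2
         //  prefix or a one-byte REX.B prefix, past the extra SIB/displacement
         //  byte used when the base register is rsp/rbp/r12/r13, and finally
         //  (further down) past the two-byte opcode+ModRM of the test itself.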
3402 
3403     if (VM_Version::supports_apx_f()) {
3404       __ cmpb(Address(rbx, 0), Assembler::REX2);
3405       __ jccb(Assembler::notEqual, check_rex_prefix);
3406       __ addptr(rbx, 2);
3407       __ bind(check_rex_prefix);
3408     }
3409     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3410     __ jccb(Assembler::notEqual, no_prefix);
3411     __ addptr(rbx, 1);
3412     __ bind(no_prefix);
3413 #ifdef ASSERT
3414     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3415 #endif
3416     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3417     // r12/rsp 0x04
3418     // r13/rbp 0x05
3419     __ movzbq(rcx, Address(rbx, 1));
3420     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3421     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3422     __ cmpptr(rcx, 1);
3423     __ jccb(Assembler::above, not_special);
3424     __ addptr(rbx, 1);
3425     __ bind(not_special);
3426 #ifdef ASSERT
3427     // Verify the correct encoding of the poll we're about to skip.
3428     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3429     __ jcc(Assembler::notEqual, bail);
3430     // Mask out the modrm bits
3431     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3432     // rax encodes to 0, so if the bits are nonzero it's incorrect
3433     __ jcc(Assembler::notZero, bail);
3434 #endif
3435     // Adjust return pc forward to step over the safepoint poll instruction
3436     __ addptr(rbx, 2);
3437     __ movptr(Address(rbp, wordSize), rbx);
3438   }
3439 
3440   __ bind(no_adjust);
3441   // Normal exit, restore registers and exit.
3442   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3443   __ ret(0);
3444 
3445 #ifdef ASSERT
3446   __ bind(bail);
3447   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3448 #endif
3449 
3450   // Make sure all code is generated
3451   masm->flush();
3452 
3453   // Fill-out other meta info
3454   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3455 }
3456 
3457 //
3458 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3459 //
3460 // Generate a stub that calls into the VM to find out the proper destination
3461 // of a java call. All the argument registers are live at this point,
3462 // but since this is generic code we don't know what they are, and the caller
3463 // must do any GC of the args.
3464 //
3465 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3466   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3467   assert(is_resolve_id(id), "expected a resolve stub id");
3468 
3469   // allocate space for the code
3470   ResourceMark rm;
3471 
3472   const char* name = SharedRuntime::stub_name(id);
3473   CodeBuffer buffer(name, 1552, 512);
3474   MacroAssembler* masm = new MacroAssembler(&buffer);
3475 
3476   int frame_size_in_words;
3477 
3478   OopMapSet *oop_maps = new OopMapSet();
3479   OopMap* map = nullptr;
3480 
3481   int start = __ offset();
3482 
3483   // No need to save vector registers since they are caller-saved anyway.
3484   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3485 
3486   int frame_complete = __ offset();
3487 
3488   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3489 
3490   __ mov(c_rarg0, r15_thread);
3491 
3492   __ call(RuntimeAddress(destination));
3493 
3494 
3495   // Set an oopmap for the call site.
3496   // We need this not only for callee-saved registers, but also for volatile
3497   // registers that the compiler might be keeping live across a safepoint.
3498 
3499   oop_maps->add_gc_map( __ offset() - start, map);
3500 
3501   // rax contains the address we are going to jump to assuming no exception got installed
3502 
3503   // clear last_Java_sp
3504   __ reset_last_Java_frame(false);
3505   // check for pending exceptions
3506   Label pending;
3507   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3508   __ jcc(Assembler::notEqual, pending);
3509 
3510   // get the returned Method*
3511   __ get_vm_result_2(rbx, r15_thread);
3512   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3513 
3514   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3515 
3516   RegisterSaver::restore_live_registers(masm);
3517 
3518   // We are back to the original state on entry and ready to go.
3519 
3520   __ jmp(rax);
3521 
3522   // Pending exception after the safepoint
3523 
3524   __ bind(pending);
3525 
3526   RegisterSaver::restore_live_registers(masm);
3527 
3528   // exception pending => remove activation and forward to exception handler
3529 
3530   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3531 
3532   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3533   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3534 
3535   // -------------
3536   // make sure all code is generated
3537   masm->flush();
3538 
3539   // return the blob
3540   // frame_size_words or bytes??
3541   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3542 }
3543 
3544 // Continuation point for throwing of implicit exceptions that are
3545 // not handled in the current activation. Fabricates an exception
3546 // oop and initiates normal exception dispatching in this
3547 // frame. Since we need to preserve callee-saved values (currently
3548 // only for C2, but done for C1 as well) we need a callee-saved oop
3549 // map and therefore have to make these stubs into RuntimeStubs
3550 // rather than BufferBlobs.  If the compiler needs all registers to
3551 // be preserved between the fault point and the exception handler
3552 // then it must assume responsibility for that in
3553 // AbstractCompiler::continuation_for_implicit_null_exception or
3554 // continuation_for_implicit_division_by_zero_exception. All other
3555 // implicit exceptions (e.g., NullPointerException or
3556 // AbstractMethodError on entry) are either at call sites or
3557 // otherwise assume that stack unwinding will be initiated, so
3558 // caller saved registers were assumed volatile in the compiler.
3559 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3560   assert(is_throw_id(id), "expected a throw stub id");
3561 
3562   const char* name = SharedRuntime::stub_name(id);
3563 
3564   // Information about frame layout at time of blocking runtime call.
3565   // Note that we only have to preserve callee-saved registers since
3566   // the compilers are responsible for supplying a continuation point
3567   // if they expect all registers to be preserved.
3568   enum layout {
3569     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3570     rbp_off2,
3571     return_off,
3572     return_off2,
3573     framesize // inclusive of return address
3574   };
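       // Note: framesize is in 32-bit stack slots; the return address and the rbp
       // pushed by enter() account for 4 of them, which is why the prolog below
       // subtracts only (framesize - 4) slots from rsp.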
3575 
3576   int insts_size = 512;
3577   int locs_size  = 64;
3578 
3579   ResourceMark rm;
3580   const char* timer_msg = "SharedRuntime generate_throw_exception";
3581   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3582 
3583   CodeBuffer code(name, insts_size, locs_size);
3584   OopMapSet* oop_maps  = new OopMapSet();
3585   MacroAssembler* masm = new MacroAssembler(&code);
3586 
3587   address start = __ pc();
3588 
3589   // This is an inlined and slightly modified version of call_VM
3590   // which has the ability to fetch the return PC out of
3591   // thread-local storage and also sets up last_Java_sp slightly
3592   // differently than the real call_VM
3593 
3594   __ enter(); // required for proper stackwalking of RuntimeStub frame
3595 
3596   assert(is_even(framesize/2), "sp not 16-byte aligned");
3597 
3598   // return address and rbp are already in place
3599   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3600 
3601   int frame_complete = __ pc() - start;
3602 
3603   // Set up last_Java_sp and last_Java_fp
3604   address the_pc = __ pc();
3605   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3606   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3607 
3608   // Call runtime
3609   __ movptr(c_rarg0, r15_thread);
3610   BLOCK_COMMENT("call runtime_entry");
3611   __ call(RuntimeAddress(runtime_entry));
3612 
3613   // Generate oop map
3614   OopMap* map = new OopMap(framesize, 0);
3615 
3616   oop_maps->add_gc_map(the_pc - start, map);
3617 
3618   __ reset_last_Java_frame(true);
3619 
3620   __ leave(); // required for proper stackwalking of RuntimeStub frame
3621 
3622   // check for pending exceptions
3623 #ifdef ASSERT
3624   Label L;
3625   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3626   __ jcc(Assembler::notEqual, L);
3627   __ should_not_reach_here();
3628   __ bind(L);
3629 #endif // ASSERT
3630   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3631 
3632 
3633   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3634   RuntimeStub* stub =
3635     RuntimeStub::new_runtime_stub(name,
3636                                   &code,
3637                                   frame_complete,
3638                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3639                                   oop_maps, false);
3640   return stub;
3641 }
3642 
3643 //------------------------------Montgomery multiplication------------------------
3644 //
3645 
3646 #ifndef _WINDOWS
3647 
3648 // Subtract 0:b from carry:a.  Return carry.
3649 static julong
3650 sub(julong a[], julong b[], julong carry, long len) {
3651   long long i = 0, cnt = len;
3652   julong tmp;
3653   asm volatile("clc; "
3654                "0: ; "
3655                "mov (%[b], %[i], 8), %[tmp]; "
3656                "sbb %[tmp], (%[a], %[i], 8); "
3657                "inc %[i]; dec %[cnt]; "
3658                "jne 0b; "
3659                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3660                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3661                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3662                : "memory");
3663   return tmp;
3664 }
3665 
3666 // Multiply (unsigned) Long A by Long B, accumulating the double-
3667 // length result into the accumulator formed of T0, T1, and T2.
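     // In other words, the 192-bit accumulator T2:T1:T0 += the 128-bit product A * B.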
3668 #define MACC(A, B, T0, T1, T2)                                  \
3669 do {                                                            \
3670   unsigned long hi, lo;                                         \
3671   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3672            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3673            : "r"(A), "a"(B) : "cc");                            \
3674  } while(0)
3675 
3676 // As above, but add twice the double-length result into the
3677 // accumulator.
3678 #define MACC2(A, B, T0, T1, T2)                                 \
3679 do {                                                            \
3680   unsigned long hi, lo;                                         \
3681   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3682            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3683            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3684            : "r"(A), "a"(B) : "cc");                            \
3685  } while(0)
3686 
3687 #else //_WINDOWS
3688 
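     // Subtract 0:b from carry:a.  Return carry.  (Windows version: two's-complement
     // subtraction via _addcarry_u64, adding ~b with an initial carry-in of 1.)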
3689 static julong
3690 sub(julong a[], julong b[], julong carry, long len) {
3691   long i;
3692   julong tmp;
3693   unsigned char c = 1;
3694   for (i = 0; i < len; i++) {
3695     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3696     a[i] = tmp;
3697   }
3698   c = _addcarry_u64(c, carry, ~0, &tmp);
3699   return tmp;
3700 }
3701 
3702 // Multiply (unsigned) Long A by Long B, accumulating the double-
3703 // length result into the accumulator formed of T0, T1, and T2.
3704 #define MACC(A, B, T0, T1, T2)                          \
3705 do {                                                    \
3706   julong hi, lo;                            \
3707   lo = _umul128(A, B, &hi);                             \
3708   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3709   c = _addcarry_u64(c, hi, T1, &T1);                    \
3710   _addcarry_u64(c, T2, 0, &T2);                         \
3711  } while(0)
3712 
3713 // As above, but add twice the double-length result into the
3714 // accumulator.
3715 #define MACC2(A, B, T0, T1, T2)                         \
3716 do {                                                    \
3717   julong hi, lo;                            \
3718   lo = _umul128(A, B, &hi);                             \
3719   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3720   c = _addcarry_u64(c, hi, T1, &T1);                    \
3721   _addcarry_u64(c, T2, 0, &T2);                         \
3722   c = _addcarry_u64(0, lo, T0, &T0);                    \
3723   c = _addcarry_u64(c, hi, T1, &T1);                    \
3724   _addcarry_u64(c, T2, 0, &T2);                         \
3725  } while(0)
3726 
3727 #endif //_WINDOWS
3728 
3729 // Fast Montgomery multiplication.  The derivation of the algorithm is
3730 // in  A Cryptographic Library for the Motorola DSP56000,
3731 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
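     //
     // In outline: inv satisfies inv * n[0] == -1 (mod 2^64) (see the asserts
     // below), so choosing m[i] = t0 * inv makes t0 + m[i] * n[0] == 0 (mod 2^64).
     // The low accumulator word therefore cancels at every step, and after len
     // steps m holds (roughly) a * b * 2^(-64*len) mod n; the trailing loop
     // subtracts n while a carry remains.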
3732 
3733 static void NOINLINE
3734 montgomery_multiply(julong a[], julong b[], julong n[],
3735                     julong m[], julong inv, int len) {
3736   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3737   int i;
3738 
3739   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3740 
3741   for (i = 0; i < len; i++) {
3742     int j;
3743     for (j = 0; j < i; j++) {
3744       MACC(a[j], b[i-j], t0, t1, t2);
3745       MACC(m[j], n[i-j], t0, t1, t2);
3746     }
3747     MACC(a[i], b[0], t0, t1, t2);
3748     m[i] = t0 * inv;
3749     MACC(m[i], n[0], t0, t1, t2);
3750 
3751     assert(t0 == 0, "broken Montgomery multiply");
3752 
3753     t0 = t1; t1 = t2; t2 = 0;
3754   }
3755 
3756   for (i = len; i < 2*len; i++) {
3757     int j;
3758     for (j = i-len+1; j < len; j++) {
3759       MACC(a[j], b[i-j], t0, t1, t2);
3760       MACC(m[j], n[i-j], t0, t1, t2);
3761     }
3762     m[i-len] = t0;
3763     t0 = t1; t1 = t2; t2 = 0;
3764   }
3765 
3766   while (t0)
3767     t0 = sub(m, n, t0, len);
3768 }
3769 
3770 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3771 // multiplies so it should be up to 25% faster than Montgomery
3772 // multiplication.  However, its loop control is more complex and it
3773 // may actually run slower on some machines.
3774 
3775 static void NOINLINE
3776 montgomery_square(julong a[], julong n[],
3777                   julong m[], julong inv, int len) {
3778   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3779   int i;
3780 
3781   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3782 
3783   for (i = 0; i < len; i++) {
3784     int j;
3785     int end = (i+1)/2;
3786     for (j = 0; j < end; j++) {
3787       MACC2(a[j], a[i-j], t0, t1, t2);
3788       MACC(m[j], n[i-j], t0, t1, t2);
3789     }
3790     if ((i & 1) == 0) {
3791       MACC(a[j], a[j], t0, t1, t2);
3792     }
3793     for (; j < i; j++) {
3794       MACC(m[j], n[i-j], t0, t1, t2);
3795     }
3796     m[i] = t0 * inv;
3797     MACC(m[i], n[0], t0, t1, t2);
3798 
3799     assert(t0 == 0, "broken Montgomery square");
3800 
3801     t0 = t1; t1 = t2; t2 = 0;
3802   }
3803 
3804   for (i = len; i < 2*len; i++) {
3805     int start = i-len+1;
3806     int end = start + (len - start)/2;
3807     int j;
3808     for (j = start; j < end; j++) {
3809       MACC2(a[j], a[i-j], t0, t1, t2);
3810       MACC(m[j], n[i-j], t0, t1, t2);
3811     }
3812     if ((i & 1) == 0) {
3813       MACC(a[j], a[j], t0, t1, t2);
3814     }
3815     for (; j < len; j++) {
3816       MACC(m[j], n[i-j], t0, t1, t2);
3817     }
3818     m[i-len] = t0;
3819     t0 = t1; t1 = t2; t2 = 0;
3820   }
3821 
3822   while (t0)
3823     t0 = sub(m, n, t0, len);
3824 }
3825 
3826 // Swap words in a longword.
3827 static julong swap(julong x) {
3828   return (x << 32) | (x >> 32);
3829 }
3830 
3831 // Copy len longwords from s to d, word-swapping as we go.  The
3832 // destination array is reversed.
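     // (The jint arrays handed to us from the Java side are most-significant-word
     // first, while the Montgomery routines above index their julong arrays
     // least-significant-word first; hence the reversal plus the per-julong swap.)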
3833 static void reverse_words(julong *s, julong *d, int len) {
3834   d += len;
3835   while(len-- > 0) {
3836     d--;
3837     *d = swap(*s);
3838     s++;
3839   }
3840 }
3841 
3842 // The threshold at which squaring is advantageous was determined
3843 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3844 #define MONTGOMERY_SQUARING_THRESHOLD 64
3845 
3846 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3847                                         jint len, jlong inv,
3848                                         jint *m_ints) {
3849   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3850   int longwords = len/2;
3851 
3852   // Make very sure we don't use so much space that the stack might
3853   // overflow.  512 jints corresponds to a 16384-bit integer and
3854   // will use a total of 8K bytes of stack space here.
3855   int divisor = sizeof(julong) * 4;
3856   guarantee(longwords <= 8192 / divisor, "must be");
3857   int total_allocation = longwords * sizeof (julong) * 4;
3858   julong *scratch = (julong *)alloca(total_allocation);
3859 
3860   // Local scratch arrays
3861   julong
3862     *a = scratch + 0 * longwords,
3863     *b = scratch + 1 * longwords,
3864     *n = scratch + 2 * longwords,
3865     *m = scratch + 3 * longwords;
3866 
3867   reverse_words((julong *)a_ints, a, longwords);
3868   reverse_words((julong *)b_ints, b, longwords);
3869   reverse_words((julong *)n_ints, n, longwords);
3870 
3871   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3872 
3873   reverse_words(m, (julong *)m_ints, longwords);
3874 }
3875 
3876 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3877                                       jint len, jlong inv,
3878                                       jint *m_ints) {
3879   assert(len % 2 == 0, "array length in montgomery_square must be even");
3880   int longwords = len/2;
3881 
3882   // Make very sure we don't use so much space that the stack might
3883   // overflow.  512 jints corresponds to a 16384-bit integer and
3884   // will use a total of 6K bytes of stack space here.
3885   int divisor = sizeof(julong) * 3;
3886   guarantee(longwords <= (8192 / divisor), "must be");
3887   int total_allocation = longwords * sizeof (julong) * 3;
3888   julong *scratch = (julong *)alloca(total_allocation);
3889 
3890   // Local scratch arrays
3891   julong
3892     *a = scratch + 0 * longwords,
3893     *n = scratch + 1 * longwords,
3894     *m = scratch + 2 * longwords;
3895 
3896   reverse_words((julong *)a_ints, a, longwords);
3897   reverse_words((julong *)n_ints, n, longwords);
3898 
3899   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3900     ::montgomery_square(a, n, m, (julong)inv, longwords);
3901   } else {
3902     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3903   }
3904 
3905   reverse_words(m, (julong *)m_ints, longwords);
3906 }
3907 
3908 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3909   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3910   CodeBuffer buffer(buf);
3911   short buffer_locs[20];
3912   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3913                                          sizeof(buffer_locs)/sizeof(relocInfo));
3914 
3915   MacroAssembler* masm = new MacroAssembler(&buffer);
3916 
3917   const Array<SigEntry>* sig_vk = vk->extended_sig();
3918   const Array<VMRegPair>* regs = vk->return_regs();
3919 
3920   int pack_fields_jobject_off = __ offset();
3921   // Resolve pre-allocated buffer from JNI handle.
3922   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3923   __ movptr(rax, Address(r13, 0));
3924   __ resolve_jobject(rax /* value */,
3925                      r15_thread /* thread */,
3926                      r12 /* tmp */);
3927   __ movptr(Address(r13, 0), rax);
3928 
3929   int pack_fields_off = __ offset();
3930 
3931   int j = 1;
3932   for (int i = 0; i < sig_vk->length(); i++) {
3933     BasicType bt = sig_vk->at(i)._bt;
3934     if (bt == T_METADATA) {
3935       continue;
3936     }
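         // A T_VOID entry marks the upper half of the preceding T_LONG/T_DOUBLE:
         // there is no separate field to copy, but it still owns an entry in regs,
         // so only the register index advances.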
3937     if (bt == T_VOID) {
3938       if (sig_vk->at(i-1)._bt == T_LONG ||
3939           sig_vk->at(i-1)._bt == T_DOUBLE) {
3940         j++;
3941       }
3942       continue;
3943     }
3944     int off = sig_vk->at(i)._offset;
3945     assert(off > 0, "offset in object should be positive");
3946     VMRegPair pair = regs->at(j);
3947     VMReg r_1 = pair.first();
3948     VMReg r_2 = pair.second();
3949     Address to(rax, off);
3950     if (bt == T_FLOAT) {
3951       __ movflt(to, r_1->as_XMMRegister());
3952     } else if (bt == T_DOUBLE) {
3953       __ movdbl(to, r_1->as_XMMRegister());
3954     } else {
3955       Register val = r_1->as_Register();
3956       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3957       if (is_reference_type(bt)) {
3958         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3959       } else {
3960         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3961       }
3962     }
3963     j++;
3964   }
3965   assert(j == regs->length(), "missed a field?");
3966   if (vk->has_nullable_atomic_layout()) {
3967     // Set the null marker
3968     __ movb(Address(rax, vk->null_marker_offset()), 1);
3969   }
3970   __ ret(0);
3971 
3972   int unpack_fields_off = __ offset();
3973 
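  // A null buffered value has no fields to load; otherwise the loop below
  // mirrors the packing loop above, reading each field of the buffered value
  // back into its return register.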
3974   Label skip;
3975   __ testptr(rax, rax);
3976   __ jcc(Assembler::zero, skip);
3977 
3978   j = 1;
3979   for (int i = 0; i < sig_vk->length(); i++) {
3980     BasicType bt = sig_vk->at(i)._bt;
3981     if (bt == T_METADATA) {
3982       continue;
3983     }
3984     if (bt == T_VOID) {
3985       if (sig_vk->at(i-1)._bt == T_LONG ||
3986           sig_vk->at(i-1)._bt == T_DOUBLE) {
3987         j++;
3988       }
3989       continue;
3990     }
3991     int off = sig_vk->at(i)._offset;
3992     assert(off > 0, "offset in object should be positive");
3993     VMRegPair pair = regs->at(j);
3994     VMReg r_1 = pair.first();
3995     VMReg r_2 = pair.second();
3996     Address from(rax, off);
3997     if (bt == T_FLOAT) {
3998       __ movflt(r_1->as_XMMRegister(), from);
3999     } else if (bt == T_DOUBLE) {
4000       __ movdbl(r_1->as_XMMRegister(), from);
4001     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4002       assert_different_registers(rax, r_1->as_Register());
4003       __ load_heap_oop(r_1->as_Register(), from);
4004     } else {
4005       assert(is_java_primitive(bt), "unexpected basic type");
4006       assert_different_registers(rax, r_1->as_Register());
4007       size_t size_in_bytes = type2aelembytes(bt);
4008       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4009     }
4010     j++;
4011   }
4012   assert(j == regs->length(), "missed a field?");
4013 
4014   __ bind(skip);
4015   __ ret(0);
4016 
4017   __ flush();
4018 
4019   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4020 }
4021 
4022 #if INCLUDE_JFR
4023 
4024 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
4025 // It returns a jobject handle to the event writer.
4026 // The handle is dereferenced and the return value is the event writer oop.
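// The stub establishes a last_Java_frame so the VM call can walk the stack,
// then resolves the returned global handle through the GC barrier before
// returning the event writer oop in rax.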
4027 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
4028   enum layout {
4029     rbp_off,
4030     rbpH_off,
4031     return_off,
4032     return_off2,
4033     framesize // inclusive of return address
4034   };
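  // framesize counts 32-bit VMRegImpl slots: the saved rbp (2 slots) plus the
  // return address (2 slots) gives 4 slots, i.e. 2 machine words. The shift by
  // (LogBytesPerWord - LogBytesPerInt) == 1 when creating the RuntimeStub below
  // converts slots to words: 4 >> 1 == 2.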
4035 
4036   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
4037   CodeBuffer code(name, 1024, 64);
4038   MacroAssembler* masm = new MacroAssembler(&code);
4039   address start = __ pc();
4040 
4041   __ enter();
4042   address the_pc = __ pc();
4043 
4044   int frame_complete = the_pc - start;
4045 
4046   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
4047   __ movptr(c_rarg0, r15_thread);
4048   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
4049   __ reset_last_Java_frame(true);
4050 
4051   // rax holds the jobject handle result; unpack and process it through a barrier.
4052   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
4053 
4054   __ leave();
4055   __ ret(0);
4056 
4057   OopMapSet* oop_maps = new OopMapSet();
4058   OopMap* map = new OopMap(framesize, 1);
4059   oop_maps->add_gc_map(frame_complete, map);
4060 
4061   RuntimeStub* stub =
4062     RuntimeStub::new_runtime_stub(name,
4063                                   &code,
4064                                   frame_complete,
4065                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4066                                   oop_maps,
4067                                   false);
4068   return stub;
4069 }
4070 
4071 // For c2: call into the runtime to return a leased buffer.
4072 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
4073   enum layout {
4074     rbp_off,
4075     rbpH_off,
4076     return_off,
4077     return_off2,
4078     framesize // inclusive of return address
4079   };
4080 
4081   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
4082   CodeBuffer code(name, 1024, 64);
4083   MacroAssembler* masm = new MacroAssembler(&code);
4084   address start = __ pc();
4085 
4086   __ enter();
4087   address the_pc = __ pc();
4088 
4089   int frame_complete = the_pc - start;
4090 
4091   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4092   __ movptr(c_rarg0, r15_thread);
4093   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4094   __ reset_last_Java_frame(true);
4095 
4096   __ leave();
4097   __ ret(0);
4098 
4099   OopMapSet* oop_maps = new OopMapSet();
4100   OopMap* map = new OopMap(framesize, 1);
4101   oop_maps->add_gc_map(frame_complete, map);
4102 
4103   RuntimeStub* stub =
4104     RuntimeStub::new_runtime_stub(name,
4105                                   &code,
4106                                   frame_complete,
4107                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4108                                   oop_maps,
4109                                   false);
4110   return stub;
4111 }
4112 
4113 #endif // INCLUDE_JFR